// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

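/*
 * The btree node cache keeps a reserve of preallocated nodes, sized from the
 * number and depth of the btree roots: the reserve is what guarantees that
 * allocating memory for a new node can always succeed, so that btree updates
 * and IO can always make forward progress (see bch2_btree_cache_scan()).
 */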
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->c.level) * 8;

        c->btree_cache.reserve = reserve;
}

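/* Number of cached nodes the shrinker may free: everything above the reserve: */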
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
#ifdef __KERNEL__
        vfree(b->aux_data);
#else
        munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
        b->aux_data = NULL;

        bc->used--;
        list_move(&b->list, &bc->freed);
}

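/*
 * Nodes are hashed on b->hash_val, a 64 bit value derived from the node's
 * pointer key, so the rhashtable compare function only needs to compare that:
 */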
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, hash_val),
        .key_len        = sizeof(u64),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

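/*
 * Allocate the buffers backing a node: b->data holds the on disk btree node,
 * b->aux_data the in-memory lookup tables. In userspace (bcachefs-tools) the
 * aux_data buffer comes from mmap() rather than vmalloc_exec().
 */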
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        BUG_ON(b->data || b->aux_data);

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
                return -ENOMEM;
        }

        return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        six_lock_init(&b->c.lock);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
        return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b = __btree_node_mem_alloc(c);
        if (!b)
                return NULL;

        if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
                kfree(b);
                return NULL;
        }

        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;

        six_lock_wakeup_all(&b->c.lock);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        BUG_ON(b->hash_val);
        b->hash_val = btree_ptr_hash_val(&b->key);

        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->c.level      = level;
        b->c.btree_id   = id;

        if (level)
                six_lock_pcpu_alloc(&b->c.lock);
        else
                six_lock_pcpu_free_rcu(&b->c.lock);

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                             const struct bkey_i *k)
{
        u64 v = btree_ptr_hash_val(k);

        return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
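/*
 * Used, via btree_node_reclaim() and btree_node_write_and_reclaim(), by the
 * shrinker, the node allocator and the cannibalize path. Returns 0 with the
 * intent and write locks still held if the node could be reclaimed, -ENOMEM
 * otherwise; with @flush set, dirty nodes and nodes with IO in flight are
 * written out and waited on rather than skipped.
 */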
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);

        if (!six_trylock_intent(&b->c.lock))
                return -ENOMEM;

        if (!six_trylock_write(&b->c.lock))
                goto out_unlock_intent;

        if (btree_node_noevict(b))
                goto out_unlock;

        if (!btree_node_may_write(b))
                goto out_unlock;

        if (btree_node_dirty(b) &&
            test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                goto out_unlock;

        if (btree_node_dirty(b) ||
            btree_node_write_in_flight(b) ||
            btree_node_read_in_flight(b)) {
                if (!flush)
                        goto out_unlock;

                wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                               TASK_UNINTERRUPTIBLE);

                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
                        __bch2_btree_node_write(c, b);

                /* wait for any in flight btree write */
                btree_node_wait_on_io(b);
        }
out:
        if (b->hash_val && !ret)
                trace_btree_node_reap(c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->c.lock);
out_unlock_intent:
        six_unlock_intent(&b->c.lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true);
}

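/*
 * Shrinker callback: walk the freeable list, then the live list, reclaiming
 * nodes whose accessed bit isn't set (and clearing the bit on the ones we
 * skip). Note that nr_to_scan and the return value are in units of pages.
 */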
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free;
        unsigned long touched = 0;
        unsigned long freed = 0;
        unsigned i, flags;

        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;

        /* Return -1 if we can't do anything right now */
        if (sc->gfp_mask & __GFP_FS)
                mutex_lock(&bc->lock);
        else if (!mutex_trylock(&bc->lock))
                return -1;

        flags = memalloc_nofs_save();

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        nr /= btree_pages(c);
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                touched++;

                if (freed >= nr)
                        break;

                if (++i > 3 &&
                    !btree_node_reclaim(c, b)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (freed >= nr) {
                        /* Save position */
                        if (&t->list != &bc->live)
                                list_move_tail(&bc->live, &t->list);
                        break;
                }

                if (!btree_node_accessed(b) &&
                    !btree_node_reclaim(c, b)) {
                        /* can't call bch2_btree_node_hash_remove under lock */
                        freed++;
                        if (&t->list != &bc->live)
                                list_move_tail(&bc->live, &t->list);

                        btree_node_data_free(c, b);
                        mutex_unlock(&bc->lock);

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);

                        if (freed >= nr)
                                goto out;

                        if (sc->gfp_mask & __GFP_FS)
                                mutex_lock(&bc->lock);
                        else if (!mutex_trylock(&bc->lock))
                                goto out;
                        goto restart;
                } else
                        clear_btree_node_accessed(b);
        }

        mutex_unlock(&bc->lock);
out:
        memalloc_nofs_restore(flags);
        return (unsigned long) freed * btree_pages(c);
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (bch2_btree_shrinker_disabled)
                return 0;

        return btree_cache_can_free(bc) * btree_pages(c);
}

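/*
 * Filesystem shutdown: unregister the shrinker, then splice every cached node
 * (roots, verify data, freeable list) onto the live list so one loop can free
 * them all, and finally tear down the freed list and the hash table.
 */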
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i, flags;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty(c, b);

                btree_node_data_free(c, b);
        }

        BUG_ON(atomic_read(&c->btree_cache.dirty));

        while (!list_empty(&bc->freed)) {
                b = list_first_entry(&bc->freed, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

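/*
 * Filesystem startup: initialize the hash table, preallocate the reserve of
 * btree nodes (which end up on the freeable list) and register the shrinker.
 */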
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

        mutex_init(&c->verify_lock);

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.seeks                = 4;
        bc->shrink.batch                = btree_pages(c) * 2;
        ret = register_shrinker(&bc->shrink);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_btree_node_cannibalize_unlock(c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

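/*
 * Take the cannibalize lock: returns 0 on success, -ENOMEM if another thread
 * holds it and no closure was passed, or -EAGAIN after putting @cl on the
 * waitlist so the caller can retry when woken.
 */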
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_btree_node_cannibalize_lock_fail(c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_btree_node_cannibalize_lock_fail(c);
        return -EAGAIN;

success:
        trace_btree_node_cannibalize_lock(c);
        return 0;
}

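/*
 * Steal memory from another cached node: try nodes we can reclaim without
 * blocking first, then fall back to writing out dirty nodes and reclaiming
 * those. Only called while holding the cannibalize lock.
 */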
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

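/*
 * Allocate a btree node: reuse one off the freeable list if possible,
 * otherwise take a struct btree off the freed list or allocate a new one.
 * Returns the node with intent and write locks held, or ERR_PTR(-ENOMEM) -
 * unless we hold the cannibalize lock and can steal another node's memory.
 */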
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u64 start_time = local_clock();
        unsigned flags;

        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b, &bc->freeable, list)
                if (!btree_node_reclaim(c, b))
                        goto got_node;

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, &bc->freed, list)
                if (!btree_node_reclaim(c, b))
                        goto got_node;

        b = NULL;
got_node:
        if (b)
                list_del_init(&b->list);
        mutex_unlock(&bc->lock);

        if (!b) {
                b = __btree_node_mem_alloc(c);
                if (!b)
                        goto err;

                BUG_ON(!six_trylock_intent(&b->c.lock));
                BUG_ON(!six_trylock_write(&b->c.lock));
        }

        if (!b->data) {
                if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
                        goto err;

                mutex_lock(&bc->lock);
                bc->used++;
                mutex_unlock(&bc->lock);
        }

        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_write_in_flight(b));
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        bch2_btree_keys_init(b);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        memalloc_nofs_restore(flags);
        return b;
err:
        mutex_lock(&bc->lock);

        if (b) {
                list_add(&b->list, &bc->freed);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
        }

        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b = btree_node_cannibalize(c);
                list_del_init(&b->list);
                mutex_unlock(&bc->lock);

                bch2_btree_node_hash_remove(bc, b);

                trace_btree_node_cannibalize(c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
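/*
 * Read a btree node into the cache. Returns the node locked with @lock_type on
 * success; NULL if we raced with another fill or if @sync is false (the read
 * was issued asynchronously); otherwise an ERR_PTR.
 */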
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_iter *iter,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        if (iter && !bch2_btree_node_relock(iter, level + 1))
                return ERR_PTR(-EINTR);

        b = bch2_btree_node_mem_alloc(c);
        if (IS_ERR(b))
                return b;

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                b->hash_val = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        /* Unlock before doing IO: */
        if (iter && sync)
                bch2_trans_unlock(iter->trans);

        bch2_btree_node_read(c, b, sync);

        six_unlock_write(&b->c.lock);

        if (!sync) {
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        /*
         * XXX: this will probably always fail because btree_iter_relock()
         * currently fails for iterators that aren't pointed at a valid btree
         * node
         */
        if (iter && !bch2_trans_relock(iter->trans)) {
                six_unlock_intent(&b->c.lock);
                return ERR_PTR(-EINTR);
        }

        if (lock_type == SIX_LOCK_read)
                six_lock_downgrade(&b->c.lock);

        return b;
}

static int lock_node_check_fn(struct six_lock *lock, void *p)
{
        struct btree *b = container_of(lock, struct btree, c.lock);
        const struct bkey_i *k = p;

        return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}

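/*
 * Sanity check: the btree id, level, min_key and max_key in the node's header
 * must match the pointer we used to find it; if they don't, flag the
 * filesystem as inconsistent.
 */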
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
        char buf1[100], buf2[100], buf3[100], buf4[100];

        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;

        bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2
                ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
                : POS_MIN);
        bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);

        bch2_bpos_to_text(&PBUF(buf3), b->key.k.p);
        bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
        bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
                             "btree: ptr %u header %llu\n"
                             "level: ptr %u header %llu\n"
                             "min ptr %s node header %s\n"
                             "max ptr %s node header %s",
                             b->c.btree_id,     BTREE_NODE_ID(b->data),
                             b->c.level,        BTREE_NODE_LEVEL(b->data),
                             buf1, buf2, buf3, buf4);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
        if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
            b->c.level != BTREE_NODE_LEVEL(b->data) ||
            bpos_cmp(b->data->max_key, b->key.k.p) ||
            (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
             bpos_cmp(b->data->min_key,
                      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
                btree_bad_header(c, b);
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * Returns an ERR_PTR on failure: -EINTR if the transaction needs to be
 * restarted (e.g. relocking after IO failed), -EIO if the node couldn't be
 * read.
 *
 * On success the btree node is returned with a lock of type @lock_type held.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);
        if (b)
                goto lock_node;
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
                                         level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
lock_node:
                /*
                 * There's a potential deadlock with splits and insertions into
                 * interior nodes we have to avoid:
                 *
                 * The other thread might be holding an intent lock on the node
                 * we want, and they want to update its parent node so they're
                 * going to upgrade their intent lock on the parent node to a
                 * write lock.
                 *
                 * But if we're holding a read lock on the parent, and we're
                 * trying to get the intent lock they're holding, we deadlock.
                 *
                 * So to avoid this we drop the read locks on parent nodes when
                 * we're starting to take intent locks - and handle the race.
                 *
                 * The race is that they might be about to free the node we
                 * want, and dropping our read lock on the parent node lets them
                 * update the parent marking the node we want as freed, and then
                 * free it:
                 *
                 * To guard against this, btree nodes are evicted from the cache
                 * when they're freed - and b->hash_val is zeroed out, which we
                 * check for after we lock the node.
                 *
                 * Then, bch2_btree_node_relock() on the parent will fail - because
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
                if (btree_node_read_locked(iter, level + 1))
                        btree_node_unlock(iter, level + 1);

                if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
                                     lock_node_check_fn, (void *) k, trace_ip)) {
                        if (b->hash_val != btree_ptr_hash_val(k))
                                goto retry;
                        return ERR_PTR(-EINTR);
                }

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
                        if (bch2_btree_node_relock(iter, level + 1))
                                goto retry;

                        trace_trans_restart_btree_node_reused(iter->trans->ip);
                        return ERR_PTR(-EINTR);
                }
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(iter->trans);

                wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                               TASK_UNINTERRUPTIBLE);

                /*
                 * XXX: check if this always fails - btree_iter_relock()
                 * currently fails for iterators that aren't pointed at a valid
                 * btree node
                 */
                if (iter && !bch2_trans_relock(iter->trans))
                        return ERR_PTR(-EINTR);
                goto retry;
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != iter->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

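/*
 * Variant of bch2_btree_node_get() for use without a btree iterator: takes a
 * read lock on the node, and may cannibalize another cached node if memory
 * allocation fails. With @nofill set, returns NULL instead of reading the node
 * in from disk.
 */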
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);
        if (b)
                goto lock_node;
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                if (nofill)
                        goto out;

                b = bch2_btree_node_fill(c, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b) &&
                    !bch2_btree_cache_cannibalize_lock(c, NULL))
                        goto retry;

                if (IS_ERR(b))
                        goto out;
        } else {
lock_node:
                ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
                if (ret)
                        goto retry;

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
                             b->c.level != level)) {
                        six_unlock_read(&b->c.lock);
                        goto retry;
                }
        }

        /* XXX: waiting on IO with btree locks held: */
        wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                       TASK_UNINTERRUPTIBLE);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
                b = ERR_PTR(-EIO);
                goto out;
        }

        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
}

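/*
 * Start an asynchronous read of a node we expect to need soon, without keeping
 * any locks on it; when called with an iterator, the parent node must already
 * be locked.
 */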
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
                              const struct bkey_i *k,
                              enum btree_id btree_id, unsigned level)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(iter && !btree_node_locked(iter, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_cache_find(bc, k);
        if (b)
                return;

        bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}

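/*
 * Evict a btree node we're deleting: flush any dirty write, wait for in flight
 * IO, then free its memory and remove it from the hash table so future lookups
 * fail.
 */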
void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = btree_cache_find(bc, k);
        if (!b)
                return;

        six_lock_intent(&b->c.lock, NULL, NULL);
        six_lock_write(&b->c.lock, NULL, NULL);

        wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
                       TASK_UNINTERRUPTIBLE);
        __bch2_btree_node_write(c, b);

        /* wait for any in flight btree write */
        btree_node_wait_on_io(b);

        BUG_ON(btree_node_dirty(b));

        mutex_lock(&bc->lock);
        btree_node_data_free(c, b);
        bch2_btree_node_hash_remove(bc, b);
        mutex_unlock(&bc->lock);

        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        pr_buf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
        pr_buf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
        pr_buf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

        pr_buf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
        pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
        pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
        pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}