// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

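/*
 * Recompute the number of cached btree nodes we keep in reserve: a base of
 * 16, plus extra headroom while the btree roots haven't been set up yet and
 * for each multi-level btree. The shrinker never frees nodes below this
 * reserve (see btree_cache_can_free()).
 */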
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
	unsigned i, reserve = 16;

	if (!c->btree_roots[0].b)
		reserve += 8;

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].b)
			reserve += min_t(unsigned, 1,
					 c->btree_roots[i].b->c.level) * 8;

	c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
	return max_t(int, 0, bc->used - bc->reserve);
}

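/*
 * Free a node's data and aux_data buffers, but keep the struct btree itself
 * around on the freed list so it can be reused without reallocating.
 */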
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
	struct btree_cache *bc = &c->btree_cache;

	EBUG_ON(btree_node_write_in_flight(b));

	kvpfree(b->data, btree_bytes(c));
	b->data = NULL;
#ifdef __KERNEL__
	vfree(b->aux_data);
#else
	munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
	b->aux_data = NULL;

	bc->used--;
	list_move(&b->list, &bc->freed);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				   const void *obj)
{
	const struct btree *b = obj;
	const u64 *v = arg->key;

	return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
	.head_offset	= offsetof(struct btree, hash),
	.key_offset	= offsetof(struct btree, hash_val),
	.key_len	= sizeof(u64),
	.obj_cmpfn	= bch2_btree_cache_cmp_fn,
};

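/*
 * Allocate the buffers backing a btree node: @data holds the on disk node,
 * @aux_data holds the lookup tables built from it. In userspace builds
 * aux_data comes from an anonymous executable mapping instead of
 * vmalloc_exec().
 */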
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	b->data = kvpmalloc(btree_bytes(c), gfp);
	if (!b->data)
		return -ENOMEM;
#ifdef __KERNEL__
	b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
#endif
	if (!b->aux_data) {
		kvpfree(b->data, btree_bytes(c));
		b->data = NULL;
		return -ENOMEM;
	}

	return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	six_lock_init(&b->c.lock);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(btree_bytes(c));
	return b;
}

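/*
 * Allocate a new btree node, with its data buffers, and put it on the
 * freeable list; bch2_fs_btree_cache_init() uses this to build up the
 * btree cache reserve.
 */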
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b = __btree_node_mem_alloc(c);
	if (!b)
		return NULL;

	if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}

	bc->used++;
	list_add(&b->list, &bc->freeable);
	return b;
}

/* Btree in memory cache - hash table */

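/*
 * Cached nodes are indexed by a hash of their key (btree_ptr_hash_val());
 * a node with hash_val == 0 is considered unhashed and won't be found by
 * lookups.
 */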
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
	rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

	/* Cause future lookups for this node to fail: */
	b->hash_val = 0;

	six_lock_wakeup_all(&b->c.lock);
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
	BUG_ON(b->hash_val);
	b->hash_val = btree_ptr_hash_val(&b->key);

	return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
					     bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
				unsigned level, enum btree_id id)
{
	int ret;

	b->c.level	= level;
	b->c.btree_id	= id;

	if (level)
		six_lock_pcpu_alloc(&b->c.lock);
	else
		six_lock_pcpu_free_rcu(&b->c.lock);

	mutex_lock(&bc->lock);
	ret = __bch2_btree_node_hash_insert(bc, b);
	if (!ret)
		list_add(&b->list, &bc->live);
	mutex_unlock(&bc->lock);

	return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
				     const struct bkey_i *k)
{
	u64 v = btree_ptr_hash_val(k);

	return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * Try to reclaim a btree node: we need the intent and write trylocks, and
 * the node must be clean with no IO in flight - unless @flush is set, in
 * which case we write it out and wait for the IO to complete. Returns 0
 * with the node still write locked, or -ENOMEM if it can't be evicted
 * right now.
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
	struct btree_cache *bc = &c->btree_cache;
	int ret = 0;

	lockdep_assert_held(&bc->lock);

	if (!six_trylock_intent(&b->c.lock))
		return -ENOMEM;

	if (!six_trylock_write(&b->c.lock))
		goto out_unlock_intent;

	if (btree_node_noevict(b))
		goto out_unlock;

	if (!btree_node_may_write(b))
		goto out_unlock;

	if (btree_node_dirty(b) &&
	    test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
		goto out_unlock;

	if (btree_node_dirty(b) ||
	    btree_node_write_in_flight(b) ||
	    btree_node_read_in_flight(b)) {
		if (!flush)
			goto out_unlock;

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * Using the underscore version because we don't want to compact
		 * bsets after the write, since this node is about to be evicted
		 * - unless btree verify mode is enabled, since it runs out of
		 * the post write cleanup:
		 */
		if (bch2_verify_btree_ondisk)
			bch2_btree_node_write(c, b, SIX_LOCK_intent);
		else
			__bch2_btree_node_write(c, b);

		/* wait for any in flight btree write */
		btree_node_wait_on_io(b);
	}
out:
	if (b->hash_val && !ret)
		trace_btree_node_reap(c, b);
	return ret;
out_unlock:
	six_unlock_write(&b->c.lock);
out_unlock_intent:
	six_unlock_intent(&b->c.lock);
	ret = -ENOMEM;
	goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, false);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
	return __btree_node_reclaim(c, b, true);
}

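/*
 * Shrinker callback: walk the freeable list, then the live list, reclaiming
 * nodes that haven't been accessed since the last scan, until we've freed
 * sc->nr_to_scan worth of pages (or run out of nodes we're allowed to free).
 */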
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_cache.shrink);
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b, *t;
	unsigned long nr = sc->nr_to_scan;
	unsigned long can_free;
	unsigned long touched = 0;
	unsigned long freed = 0;
	unsigned i, flags;

	if (bch2_btree_shrinker_disabled)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	flags = memalloc_nofs_save();

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= btree_pages(c);
	can_free = btree_cache_can_free(bc);
	nr = min_t(unsigned long, nr, can_free);

	i = 0;
	list_for_each_entry_safe(b, t, &bc->freeable, list) {
		touched++;

		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !btree_node_reclaim(c, b)) {
			btree_node_data_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
			freed++;
		}
	}
restart:
	list_for_each_entry_safe(b, t, &bc->live, list) {
		touched++;

		if (freed >= nr) {
			/* Save position */
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);
			break;
		}

		if (!btree_node_accessed(b) &&
		    !btree_node_reclaim(c, b)) {
			/* can't call bch2_btree_node_hash_remove under lock  */
			freed++;
			if (&t->list != &bc->live)
				list_move_tail(&bc->live, &t->list);

			btree_node_data_free(c, b);
			mutex_unlock(&bc->lock);

			bch2_btree_node_hash_remove(bc, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);

			if (freed >= nr)
				goto out;

			if (sc->gfp_mask & __GFP_FS)
				mutex_lock(&bc->lock);
			else if (!mutex_trylock(&bc->lock))
				goto out;
			goto restart;
		} else
			clear_btree_node_accessed(b);
	}

	mutex_unlock(&bc->lock);
out:
	memalloc_nofs_restore(flags);
	return (unsigned long) freed * btree_pages(c);
}

341
342 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
343                                             struct shrink_control *sc)
344 {
345         struct bch_fs *c = container_of(shrink, struct bch_fs,
346                                         btree_cache.shrink);
347         struct btree_cache *bc = &c->btree_cache;
348
349         if (bch2_btree_shrinker_disabled)
350                 return 0;
351
352         return btree_cache_can_free(bc) * btree_pages(c);
353 }
354
355 void bch2_fs_btree_cache_exit(struct bch_fs *c)
356 {
357         struct btree_cache *bc = &c->btree_cache;
358         struct btree *b;
359         unsigned i, flags;
360
361         if (bc->shrink.list.next)
362                 unregister_shrinker(&bc->shrink);
363
364         /* vfree() can allocate memory: */
365         flags = memalloc_nofs_save();
366         mutex_lock(&bc->lock);
367
368         if (c->verify_data)
369                 list_move(&c->verify_data->list, &bc->live);
370
371         kvpfree(c->verify_ondisk, btree_bytes(c));
372
373         for (i = 0; i < BTREE_ID_NR; i++)
374                 if (c->btree_roots[i].b)
375                         list_add(&c->btree_roots[i].b->list, &bc->live);
376
377         list_splice(&bc->freeable, &bc->live);
378
379         while (!list_empty(&bc->live)) {
380                 b = list_first_entry(&bc->live, struct btree, list);
381
382                 BUG_ON(btree_node_read_in_flight(b) ||
383                        btree_node_write_in_flight(b));
384
385                 if (btree_node_dirty(b))
386                         bch2_btree_complete_write(c, b, btree_current_write(b));
387                 clear_btree_node_dirty(c, b);
388
389                 btree_node_data_free(c, b);
390         }
391
392         BUG_ON(atomic_read(&c->btree_cache.dirty));
393
394         while (!list_empty(&bc->freed)) {
395                 b = list_first_entry(&bc->freed, struct btree, list);
396                 list_del(&b->list);
397                 six_lock_pcpu_free(&b->c.lock);
398                 kfree(b);
399         }
400
401         mutex_unlock(&bc->lock);
402         memalloc_nofs_restore(flags);
403
404         if (bc->table_init_done)
405                 rhashtable_destroy(&bc->table);
406 }
407
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	unsigned i;
	int ret = 0;

	pr_verbose_init(c->opts, "");

	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
	if (ret)
		goto out;

	bc->table_init_done = true;

	bch2_recalc_btree_reserve(c);

	for (i = 0; i < bc->reserve; i++)
		if (!__bch2_btree_node_mem_alloc(c)) {
			ret = -ENOMEM;
			goto out;
		}

	list_splice_init(&bc->live, &bc->freeable);

	mutex_init(&c->verify_lock);

	bc->shrink.count_objects	= bch2_btree_cache_count;
	bc->shrink.scan_objects		= bch2_btree_cache_scan;
	bc->shrink.seeks		= 4;
	bc->shrink.batch		= btree_pages(c) * 2;
	ret = register_shrinker(&bc->shrink);
out:
	pr_verbose_init(c->opts, "ret %i", ret);
	return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
	mutex_init(&bc->lock);
	INIT_LIST_HEAD(&bc->live);
	INIT_LIST_HEAD(&bc->freeable);
	INIT_LIST_HEAD(&bc->freed);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a
 * time, or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 * the root of the btree, we need to release this lock if we have it held.
 */
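/*
 * Typical usage (sketch, not taken verbatim from a caller): when allocating
 * btree nodes fails, take the cannibalize lock, retry the allocation (which
 * may now steal memory from another cached node), then drop the lock once
 * no more nodes need to be allocated:
 *
 *	if (bch2_btree_cache_cannibalize_lock(c, cl))
 *		return -EAGAIN;		// or -ENOMEM if no closure was passed
 *	b = bch2_btree_node_mem_alloc(c);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(c);
 */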
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;

	if (bc->alloc_lock == current) {
		trace_btree_node_cannibalize_unlock(c);
		bc->alloc_lock = NULL;
		closure_wake_up(&bc->alloc_wait);
	}
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
	struct btree_cache *bc = &c->btree_cache;
	struct task_struct *old;

	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current)
		goto success;

	if (!cl) {
		trace_btree_node_cannibalize_lock_fail(c);
		return -ENOMEM;
	}

	closure_wait(&bc->alloc_wait, cl);

	/* Try again, after adding ourselves to waitlist */
	old = cmpxchg(&bc->alloc_lock, NULL, current);
	if (old == NULL || old == current) {
		/* We raced */
		closure_wake_up(&bc->alloc_wait);
		goto success;
	}

	trace_btree_node_cannibalize_lock_fail(c);
	return -EAGAIN;

success:
	trace_btree_node_cannibalize_lock(c);
	return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	list_for_each_entry_reverse(b, &bc->live, list)
		if (!btree_node_reclaim(c, b))
			return b;

	while (1) {
		list_for_each_entry_reverse(b, &bc->live, list)
			if (!btree_node_write_and_reclaim(c, b))
				return b;

		/*
		 * Rare case: all nodes were intent-locked.
		 * Just busy-wait.
		 */
		WARN_ONCE(1, "btree cache cannibalize failed\n");
		cond_resched();
	}
}

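/*
 * Allocate a btree node in memory: try to reclaim one from the freeable
 * list, then from the freed list (struct only, data reallocated below),
 * then allocate a fresh one; as a last resort, if this thread holds the
 * cannibalize lock, evict another cached node and steal its memory.
 */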
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	u64 start_time = local_clock();
	unsigned flags;

	flags = memalloc_nofs_save();
	mutex_lock(&bc->lock);

	/*
	 * btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &bc->freeable, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	/*
	 * We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &bc->freed, list)
		if (!btree_node_reclaim(c, b))
			goto got_node;

	b = NULL;
got_node:
	if (b)
		list_del_init(&b->list);
	mutex_unlock(&bc->lock);

	if (!b) {
		b = __btree_node_mem_alloc(c);
		if (!b)
			goto err;

		BUG_ON(!six_trylock_intent(&b->c.lock));
		BUG_ON(!six_trylock_write(&b->c.lock));
	}

	if (!b->data) {
		if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
			goto err;

		mutex_lock(&bc->lock);
		bc->used++;
		mutex_unlock(&bc->lock);
	}

	BUG_ON(btree_node_hashed(b));
	BUG_ON(btree_node_write_in_flight(b));
out:
	b->flags		= 0;
	b->written		= 0;
	b->nsets		= 0;
	b->sib_u64s[0]		= 0;
	b->sib_u64s[1]		= 0;
	b->whiteout_u64s	= 0;
	bch2_btree_keys_init(b);
	set_btree_node_accessed(b);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
			       start_time);

	memalloc_nofs_restore(flags);
	return b;
err:
	mutex_lock(&bc->lock);

	if (b) {
		list_add(&b->list, &bc->freed);
		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
	}

	/* Try to cannibalize another cached btree node: */
	if (bc->alloc_lock == current) {
		b = btree_node_cannibalize(c);
		list_del_init(&b->list);
		mutex_unlock(&bc->lock);

		bch2_btree_node_hash_remove(bc, b);

		trace_btree_node_cannibalize(c);
		goto out;
	}

	mutex_unlock(&bc->lock);
	memalloc_nofs_restore(flags);
	return ERR_PTR(-ENOMEM);
}

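/*
 * Read a btree node that isn't in the cache: allocate a node, add it to the
 * hash table and kick off the read. Returns NULL if we raced with another
 * fill, or if @sync is false (the read then completes in the background).
 */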
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
				struct btree_iter *iter,
				const struct bkey_i *k,
				enum btree_id btree_id,
				unsigned level,
				enum six_lock_type lock_type,
				bool sync)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
	/*
	 * Parent node must be locked, else we could read in a btree node that's
	 * been freed:
	 */
	if (iter && !bch2_btree_node_relock(iter, level + 1))
		return ERR_PTR(-EINTR);

	b = bch2_btree_node_mem_alloc(c);
	if (IS_ERR(b))
		return b;

	bkey_copy(&b->key, k);
	if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
		/* raced with another fill: */

		/* mark as unhashed... */
		b->hash_val = 0;

		mutex_lock(&bc->lock);
		list_add(&b->list, &bc->freeable);
		mutex_unlock(&bc->lock);

		six_unlock_write(&b->c.lock);
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/* Unlock before doing IO: */
	if (iter && sync)
		bch2_trans_unlock(iter->trans);

	bch2_btree_node_read(c, b, sync);

	six_unlock_write(&b->c.lock);

	if (!sync) {
		six_unlock_intent(&b->c.lock);
		return NULL;
	}

	/*
	 * XXX: this will probably always fail because btree_iter_relock()
	 * currently fails for iterators that aren't pointed at a valid btree
	 * node
	 */
	if (iter && !bch2_trans_relock(iter->trans)) {
		six_unlock_intent(&b->c.lock);
		return ERR_PTR(-EINTR);
	}

	if (lock_type == SIX_LOCK_read)
		six_lock_downgrade(&b->c.lock);

	return b;
}

static int lock_node_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	const struct bkey_i *k = p;

	return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
	char buf1[100], buf2[100], buf3[100], buf4[100];

	if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
		return;

	bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
		: POS_MIN);
	bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);

	bch2_bpos_to_text(&PBUF(buf3), b->key.k.p);
	bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
	bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
			     "btree: ptr %u header %llu\n"
			     "level: ptr %u header %llu\n"
			     "min ptr %s node header %s\n"
			     "max ptr %s node header %s",
			     b->c.btree_id,	BTREE_NODE_ID(b->data),
			     b->c.level,	BTREE_NODE_LEVEL(b->data),
			     buf1, buf2, buf3, buf4);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
	if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
	    b->c.level != BTREE_NODE_LEVEL(b->data) ||
	    bpos_cmp(b->data->max_key, b->key.k.p) ||
	    (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
	     bpos_cmp(b->data->min_key,
		      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
		btree_bad_header(c, b);
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading
 * it in from disk if necessary.
 *
 * The btree node will be returned with either a read or an intent lock held,
 * depending on @lock_type.
 */
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
				  const struct bkey_i *k, unsigned level,
				  enum six_lock_type lock_type,
				  unsigned long trace_ip)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		/*
		 * We must have the parent locked to call bch2_btree_node_fill(),
		 * else we could read in a btree node from disk that's been
		 * freed:
		 */
		b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
					 level, lock_type, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b))
			return b;
	} else {
lock_node:
		/*
		 * There's a potential deadlock with splits and insertions into
		 * interior nodes we have to avoid:
		 *
		 * The other thread might be holding an intent lock on the node
		 * we want, and they want to update its parent node so they're
		 * going to upgrade their intent lock on the parent node to a
		 * write lock.
		 *
		 * But if we're holding a read lock on the parent, and we're
		 * trying to get the intent lock they're holding, we deadlock.
		 *
		 * So to avoid this we drop the read locks on parent nodes when
		 * we're starting to take intent locks - and handle the race.
		 *
		 * The race is that they might be about to free the node we
		 * want, and dropping our read lock on the parent node lets them
		 * update the parent marking the node we want as freed, and then
		 * free it:
		 *
		 * To guard against this, btree nodes are evicted from the cache
		 * when they're freed - and b->hash_val is zeroed out, which we
		 * check for after we lock the node.
		 *
		 * Then, bch2_btree_node_relock() on the parent will fail - because
		 * the parent was modified, when the pointer to the node we want
		 * was removed - and we'll bail out:
		 */
		if (btree_node_read_locked(iter, level + 1))
			btree_node_unlock(iter, level + 1);

		if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
				     lock_node_check_fn, (void *) k, trace_ip)) {
			if (b->hash_val != btree_ptr_hash_val(k))
				goto retry;
			return ERR_PTR(-EINTR);
		}

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.level != level ||
			     race_fault())) {
			six_unlock_type(&b->c.lock, lock_type);
			if (bch2_btree_node_relock(iter, level + 1))
				goto retry;

			trace_trans_restart_btree_node_reused(iter->trans->ip,
							      trace_ip,
							      iter->btree_id,
							      &iter->real_pos);
			return ERR_PTR(-EINTR);
		}
	}

	if (unlikely(btree_node_read_in_flight(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		bch2_trans_unlock(iter->trans);

		wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
			       TASK_UNINTERRUPTIBLE);

		/*
		 * XXX: check if this always fails - btree_iter_relock()
		 * currently fails for iterators that aren't pointed at a valid
		 * btree node
		 */
		if (iter && !bch2_trans_relock(iter->trans))
			return ERR_PTR(-EINTR);
		goto retry;
	}

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_type(&b->c.lock, lock_type);
		return ERR_PTR(-EIO);
	}

	EBUG_ON(b->c.btree_id != iter->btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);

	return b;
}

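/*
 * Like bch2_btree_node_get(), but for use when we don't have a btree
 * iterator (and thus no locks on parent nodes to worry about): only takes a
 * read lock on the node itself. If @nofill, don't read the node in if it
 * isn't already cached.
 */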
struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
					 const struct bkey_i *k,
					 enum btree_id btree_id,
					 unsigned level,
					 bool nofill)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;
	struct bset_tree *t;
	int ret;

	EBUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_node_mem_ptr(k);
	if (b)
		goto lock_node;
retry:
	b = btree_cache_find(bc, k);
	if (unlikely(!b)) {
		if (nofill)
			goto out;

		b = bch2_btree_node_fill(c, NULL, k, btree_id,
					 level, SIX_LOCK_read, true);

		/* We raced and found the btree node in the cache */
		if (!b)
			goto retry;

		if (IS_ERR(b) &&
		    !bch2_btree_cache_cannibalize_lock(c, NULL))
			goto retry;

		if (IS_ERR(b))
			goto out;
	} else {
lock_node:
		ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
		if (ret)
			goto retry;

		if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
			     b->c.btree_id != btree_id ||
			     b->c.level != level)) {
			six_unlock_read(&b->c.lock);
			goto retry;
		}
	}

	/* XXX: waiting on IO with btree locks held: */
	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);

	prefetch(b->aux_data);

	for_each_bset(b, t) {
		void *p = (u64 *) b->aux_data + t->aux_data_offset;

		prefetch(p + L1_CACHE_BYTES * 0);
		prefetch(p + L1_CACHE_BYTES * 1);
		prefetch(p + L1_CACHE_BYTES * 2);
	}

	/* avoid atomic set bit if it's not needed: */
	if (!btree_node_accessed(b))
		set_btree_node_accessed(b);

	if (unlikely(btree_node_read_error(b))) {
		six_unlock_read(&b->c.lock);
		b = ERR_PTR(-EIO);
		goto out;
	}

	EBUG_ON(b->c.btree_id != btree_id);
	EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
	btree_check_header(c, b);
out:
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}

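/*
 * Start an asynchronous read of a btree node into the cache, without
 * blocking or holding locks on the result; does nothing if it's already
 * cached.
 */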
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
			      const struct bkey_i *k,
			      enum btree_id btree_id, unsigned level)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	BUG_ON(iter && !btree_node_locked(iter, level + 1));
	BUG_ON(level >= BTREE_MAX_DEPTH);

	b = btree_cache_find(bc, k);
	if (b)
		return;

	bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
}

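/*
 * Drop a node from the in-memory cache: lock it, wait for in flight IO,
 * write it out if it's dirty, then free its memory and remove it from the
 * hash table.
 */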
void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
{
	struct btree_cache *bc = &c->btree_cache;
	struct btree *b;

	b = btree_cache_find(bc, k);
	if (!b)
		return;

	six_lock_intent(&b->c.lock, NULL, NULL);
	six_lock_write(&b->c.lock, NULL, NULL);

	wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
		       TASK_UNINTERRUPTIBLE);
	__bch2_btree_node_write(c, b);

	/* wait for any in flight btree write */
	btree_node_wait_on_io(b);

	BUG_ON(btree_node_dirty(b));

	mutex_lock(&bc->lock);
	btree_node_data_free(c, b);
	bch2_btree_node_hash_remove(bc, b);
	mutex_unlock(&bc->lock);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
			     struct btree *b)
{
	const struct bkey_format *f = &b->format;
	struct bset_stats stats;

	memset(&stats, 0, sizeof(stats));

	bch2_btree_keys_stats(b, &stats);

	pr_buf(out, "l %u ", b->c.level);
	bch2_bpos_to_text(out, b->data->min_key);
	pr_buf(out, " - ");
	bch2_bpos_to_text(out, b->data->max_key);
	pr_buf(out, ":\n"
	       "    ptrs: ");
	bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

	pr_buf(out, "\n"
	       "    format: u64s %u fields %u %u %u %u %u\n"
	       "    unpack fn len: %u\n"
	       "    bytes used %zu/%zu (%zu%% full)\n"
	       "    sib u64s: %u, %u (merge threshold %u)\n"
	       "    nr packed keys %u\n"
	       "    nr unpacked keys %u\n"
	       "    floats %zu\n"
	       "    failed unpacked %zu\n",
	       f->key_u64s,
	       f->bits_per_field[0],
	       f->bits_per_field[1],
	       f->bits_per_field[2],
	       f->bits_per_field[3],
	       f->bits_per_field[4],
	       b->unpack_fn_len,
	       b->nr.live_u64s * sizeof(u64),
	       btree_bytes(c) - sizeof(struct btree_node),
	       b->nr.live_u64s * 100 / btree_max_u64s(c),
	       b->sib_u64s[0],
	       b->sib_u64s[1],
	       c->btree_foreground_merge_threshold,
	       b->nr.packed_keys,
	       b->nr.unpacked_keys,
	       stats.floats,
	       stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
{
	pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used);
	pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty));
	pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock);
}