// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {                                             \
        if (shrinker_counter)                    \
                bc->not_freed_##counter++;       \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)    #f,
        BTREE_FLAGS()
#undef x
        NULL
};

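/*
 * Recalculate how many btree nodes we keep in reserve: a base of 16, plus 8
 * if the btree roots haven't been set up yet, and another 8 for each root
 * that has interior nodes, so splits and interior updates can always make
 * forward progress.
 */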
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->c.level) * 8;

        c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
        if (b->c.lock.readers)
                list_move(&b->list, &bc->freed_pcpu);
        else
                list_move(&b->list, &bc->freed_nonpcpu);
}

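/*
 * Free a node's data buffers but keep the struct btree itself around, on one
 * of the freed lists, so it can be reused later.
 */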
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
#ifdef __KERNEL__
        vfree(b->aux_data);
#else
        munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
        b->aux_data = NULL;

        bc->used--;

        btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, hash_val),
        .key_len        = sizeof(u64),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

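/*
 * Allocate a node's buffers: b->data holds the on disk btree node, b->aux_data
 * the auxiliary search trees. aux_data is mapped executable below
 * (vmalloc_exec() in the kernel, PROT_EXEC for the userspace mmap()).
 */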
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        BUG_ON(b->data || b->aux_data);

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
        if (b->aux_data == MAP_FAILED)
                b->aux_data = NULL;
#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
                return -ENOMEM;
        }

        return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
        struct btree *b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_set_no_check_recursion(&b->c.lock.dep_map);
#endif
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
        return b;
}

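/*
 * Allocate a new struct btree plus its buffers; the node starts out on the
 * freeable list. bch2_fs_btree_cache_init() uses this to fill the reserve.
 */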
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
        if (!b)
                return NULL;

        if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
                kfree(b);
                return NULL;
        }

        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
        BUG_ON(ret);

        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        BUG_ON(b->hash_val);
        b->hash_val = btree_ptr_hash_val(&b->key);

        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->c.level      = level;
        b->c.btree_id   = id;

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add_tail(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                     const struct bkey_i *k)
{
        u64 v = btree_ptr_hash_val(k);

        return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);
wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_dirty(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        else if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        return -ENOMEM;
                }

                /* XXX: waiting on IO with btree cache lock held */
                bch2_btree_node_wait_on_read(b);
                bch2_btree_node_wait_on_write(b);
        }

        if (!six_trylock_intent(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
                return -ENOMEM;
        }

        if (!six_trylock_write(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
                goto out_unlock_intent;
        }

        /* recheck under lock */
        if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        goto out_unlock;
                }
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        if (btree_node_noevict(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
                goto out_unlock;
        }
        if (btree_node_write_blocked(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
                goto out_unlock;
        }
        if (btree_node_will_make_reachable(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
                goto out_unlock;
        }

        if (btree_node_dirty(b)) {
                if (!flush) {
                        BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        goto out_unlock;
                }
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
                else
                        __bch2_btree_node_write(c, b, 0);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }
out:
        if (b->hash_val && !ret)
                trace_and_count(c, btree_cache_reap, c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->c.lock);
out_unlock_intent:
        six_unlock_intent(&b->c.lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
        return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true, false);
}

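/*
 * Shrinker callback: walk the freeable and live lists, freeing the buffers of
 * nodes that can be reclaimed; when most of the cache is dirty, also kick off
 * writes of dirty nodes so that future scans can make progress.
 */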
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free = 0;
        unsigned long freed = 0;
        unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
        bool trigger_writes = atomic_read(&bc->dirty) + nr >=
                bc->used * 3 / 4;

        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;

        mutex_lock(&bc->lock);
        flags = memalloc_nofs_save();

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                /*
                 * Leave a few nodes on the freeable list, so that a btree split
                 * won't have to hit the system allocator:
                 */
                if (++i <= 3)
                        continue;

                touched++;

                if (touched >= nr)
                        goto out;

                if (!btree_node_reclaim(c, b, true)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
                        bc->freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (btree_node_accessed(b)) {
                        clear_btree_node_accessed(b);
                        bc->not_freed_access_bit++;
                } else if (!btree_node_reclaim(c, b, true)) {
                        freed++;
                        btree_node_data_free(c, b);
                        bc->freed++;

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);

                        if (freed == nr)
                                goto out_rotate;
                } else if (trigger_writes &&
                           btree_node_dirty(b) &&
                           !btree_node_will_make_reachable(b) &&
                           !btree_node_write_blocked(b) &&
                           six_trylock_read(&b->c.lock)) {
                        list_move(&bc->live, &b->list);
                        mutex_unlock(&bc->lock);
                        __bch2_btree_node_write(c, b, 0);
                        six_unlock_read(&b->c.lock);
                        if (touched >= nr)
                                goto out_nounlock;
                        mutex_lock(&bc->lock);
                        goto restart;
                }

                if (touched >= nr)
                        break;
        }
out_rotate:
        if (&t->list != &bc->live)
                list_move_tail(&bc->live, &t->list);
out:
        mutex_unlock(&bc->lock);
out_nounlock:
        ret = freed;
        memalloc_nofs_restore(flags);
        trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
        return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (bch2_btree_shrinker_disabled)
                return 0;

        return btree_cache_can_free(bc);
}

static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);

        bch2_btree_cache_to_text(out, &c->btree_cache);
}

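/*
 * Filesystem shutdown: unregister the shrinker, gather every cached node onto
 * the live list, drop remaining dirty state, free all node memory, and tear
 * down the hash table.
 */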
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i, flags;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty_acct(c, b);

                btree_node_data_free(c, b);
        }

        BUG_ON(atomic_read(&c->btree_cache.dirty));

        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

        while (!list_empty(&bc->freed_nonpcpu)) {
                b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

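/*
 * Filesystem startup: set up the hash table, preallocate the reserve of btree
 * nodes (they start out on the freeable list), and register the shrinker.
 */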
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

        mutex_init(&c->verify_lock);

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
        ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed_pcpu);
        INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a
 * time, or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 * the root of the btree, we need to release this lock if we have it held.
 */
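/*
 * Sketch of a typical caller (illustrative, not copied from a real call
 * site): take the cannibalize lock before an allocation that may need to
 * steal another cached node, and drop it once the allocation has succeeded:
 *
 *      ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 *      if (!ret) {
 *              b = bch2_btree_node_mem_alloc(c, false);
 *              bch2_btree_cache_cannibalize_unlock(c);
 *      }
 */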
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_and_count(c, btree_cache_cannibalize_unlock, c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
        return -EAGAIN;

success:
        trace_and_count(c, btree_cache_cannibalize_lock, c);
        return 0;
}

static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b, false))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

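/*
 * Allocate an in-memory btree node: reuse a struct btree from the freed list
 * if possible, otherwise allocate a new one; then take data buffers from a
 * reclaimable node on the freeable list or allocate fresh ones. If memory
 * allocation fails and we hold the cannibalize lock, steal the buffers from
 * another cached node instead.
 */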
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
        struct btree_cache *bc = &c->btree_cache;
        struct list_head *freed = pcpu_read_locks
                ? &bc->freed_pcpu
                : &bc->freed_nonpcpu;
        struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;

        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, freed, list)
                if (!btree_node_reclaim(c, b, false)) {
                        list_del_init(&b->list);
                        goto got_node;
                }

        b = __btree_node_mem_alloc(c, __GFP_NOWARN);
        if (!b) {
                mutex_unlock(&bc->lock);
                b = __btree_node_mem_alloc(c, GFP_KERNEL);
                if (!b)
                        goto err;
                mutex_lock(&bc->lock);
        }

        if (pcpu_read_locks)
                six_lock_pcpu_alloc(&b->c.lock);

        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b2, &bc->freeable, list)
                if (!btree_node_reclaim(c, b2, false)) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                        goto got_mem;
                }

        mutex_unlock(&bc->lock);

        if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
                goto err;

        mutex_lock(&bc->lock);
        bc->used++;
got_mem:
        mutex_unlock(&bc->lock);

        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_write_in_flight(b));
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        bch2_btree_keys_init(b);
        set_btree_node_accessed(b);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        memalloc_nofs_restore(flags);
        return b;
err:
        mutex_lock(&bc->lock);

        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b2 = btree_node_cannibalize(c);
                bch2_btree_node_hash_remove(bc, b2);

                if (b) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                } else {
                        b = b2;
                        list_del_init(&b->list);
                }

                mutex_unlock(&bc->lock);

                trace_and_count(c, btree_cache_cannibalize, c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_trans *trans,
                                struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u32 seq;

        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
                trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
        }

        b = bch2_btree_node_mem_alloc(c, level != 0);

        if (trans && b == ERR_PTR(-ENOMEM)) {
                trans->memory_allocation_failure = true;
                trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
        }

        if (IS_ERR(b))
                return b;

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                b->hash_val = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        set_btree_node_read_in_flight(b);

        six_unlock_write(&b->c.lock);
        seq = b->c.lock.state.seq;
        six_unlock_intent(&b->c.lock);

        /* Unlock before doing IO: */
        if (trans && sync)
                bch2_trans_unlock(trans);

        bch2_btree_node_read(c, b, sync);

        if (!sync)
                return NULL;

        if (trans) {
                int ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(ret);
                }
        }

        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
                if (trans)
                        trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }

        return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
        struct printbuf buf = PRINTBUF;

        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;

        prt_printf(&buf,
               "btree node header doesn't match ptr\n"
               "btree %s level %u\n"
               "ptr: ",
               bch2_btree_ids[b->c.btree_id], b->c.level);
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

        prt_printf(&buf, "\nheader: btree %s level %llu\n"
               "min ",
               bch2_btree_ids[BTREE_NODE_ID(b->data)],
               BTREE_NODE_LEVEL(b->data));
        bch2_bpos_to_text(&buf, b->data->min_key);

        prt_printf(&buf, "\nmax ");
        bch2_bpos_to_text(&buf, b->data->max_key);

        bch2_fs_inconsistent(c, "%s", buf.buf);
        printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
        if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
            b->c.level != BTREE_NODE_LEVEL(b->data) ||
            bpos_cmp(b->data->max_key, b->key.k.p) ||
            (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
             bpos_cmp(b->data->min_key,
                      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
                btree_bad_header(c, b);
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * Returns the node locked according to @lock_type, or an ERR_PTR() on failure
 * (e.g. a transaction restart).
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);

        /*
         * Check b->hash_val _before_ calling btree_node_lock() - this might not
         * be the node we want anymore, and trying to lock the wrong node could
         * cause an unnecessary transaction restart:
         */
        if (likely(c->opts.btree_node_mem_ptr_optimization &&
                   b &&
                   b->hash_val == btree_ptr_hash_val(k)))
                goto lock_node;
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
                                         level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
lock_node:
                /*
                 * There's a potential deadlock with splits and insertions into
                 * interior nodes we have to avoid:
                 *
                 * The other thread might be holding an intent lock on the node
                 * we want, and they want to update its parent node so they're
                 * going to upgrade their intent lock on the parent node to a
                 * write lock.
                 *
                 * But if we're holding a read lock on the parent, and we're
                 * trying to get the intent lock they're holding, we deadlock.
                 *
                 * So to avoid this we drop the read locks on parent nodes when
                 * we're starting to take intent locks - and handle the race.
                 *
                 * The race is that they might be about to free the node we
                 * want, and dropping our read lock on the parent node lets them
                 * update the parent marking the node we want as freed, and then
                 * free it:
                 *
                 * To guard against this, btree nodes are evicted from the cache
                 * when they're freed - and b->hash_val is zeroed out, which we
                 * check for after we lock the node.
                 *
                 * Then, bch2_btree_node_relock() on the parent will fail - because
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
                if (btree_node_read_locked(path, level + 1))
                        btree_node_unlock(trans, path, level + 1);

                ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
                        if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;

                        trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                        return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
                }
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        goto retry;
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

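/*
 * Like bch2_btree_node_get(), but for callers that don't have a btree_path:
 * always takes a read lock, and may cannibalize another cached node if the
 * fill fails with -ENOMEM.
 */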
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        if (c->opts.btree_node_mem_ptr_optimization) {
                b = btree_node_mem_ptr(k);
                if (b)
                        goto lock_node;
        }
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                if (nofill)
                        goto out;

                b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b) &&
                    !bch2_btree_cache_cannibalize_lock(c, NULL))
                        goto retry;

                if (IS_ERR(b))
                        goto out;
        } else {
lock_node:
                ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
                             b->c.level != level)) {
                        six_unlock_read(&b->c.lock);
                        goto retry;
                }
        }

        /* XXX: waiting on IO with btree locks held: */
        __bch2_btree_node_wait_on_read(b);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
                b = ERR_PTR(-EIO);
                goto out;
        }

        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
}

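/*
 * Start an asynchronous read of a btree node into the cache, if it isn't
 * already cached.
 */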
int bch2_btree_node_prefetch(struct bch_fs *c,
                             struct btree_trans *trans,
                             struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_cache_find(bc, k);
        if (b)
                return 0;

        b = bch2_btree_node_fill(c, trans, path, k, btree_id,
                                 level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
}

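/*
 * Drop a node from the cache: wait for in-flight IO, write it out if it's
 * still dirty, then free its memory and remove it from the hash table.
 */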
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = btree_cache_find(bc, k);
        if (!b)
                return;
wait_on_io:
        /* not allowed to wait on io with btree locks held: */

        /* XXX we're called from btree_gc which will be holding other btree
         * nodes locked
         */
        __bch2_btree_node_wait_on_read(b);
        __bch2_btree_node_wait_on_write(b);

        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

        if (btree_node_dirty(b)) {
                __bch2_btree_node_write(c, b, 0);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        BUG_ON(btree_node_dirty(b));

        mutex_lock(&bc->lock);
        btree_node_data_free(c, b);
        bch2_btree_node_hash_remove(bc, b);
        mutex_unlock(&bc->lock);

        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        prt_printf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
        prt_printf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

        prt_printf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
{
        prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
        prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
        prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);

        prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
        prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
        prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
        prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
        prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
        prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
        prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
        prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
        prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
        prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
}