bcachefs-tools-debian: libbcachefs/btree_cache.c (commit d24827fb0164b37663f76e1fadd36e03b8721149)
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12
13 #include <linux/prefetch.h>
14 #include <linux/sched/mm.h>
15 #include <trace/events/bcachefs.h>
16
17 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
18 do {                                             \
19         if (shrinker_counter)                    \
20                 bc->not_freed_##counter++;       \
21 } while (0)
22
23 const char * const bch2_btree_node_flags[] = {
24 #define x(f)    #f,
25         BTREE_FLAGS()
26 #undef x
27         NULL
28 };
29
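/*
 * Size the reserve of preallocated btree nodes: a base of 16, plus 8 while the
 * first btree root hasn't been read in yet, plus 8 for each btree whose root
 * is an interior node. See the comment in bch2_btree_cache_scan() for why the
 * reserve matters.
 */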
30 void bch2_recalc_btree_reserve(struct bch_fs *c)
31 {
32         unsigned i, reserve = 16;
33
34         if (!c->btree_roots[0].b)
35                 reserve += 8;
36
37         for (i = 0; i < BTREE_ID_NR; i++)
38                 if (c->btree_roots[i].b)
39                         reserve += min_t(unsigned, 1,
40                                          c->btree_roots[i].b->c.level) * 8;
41
42         c->btree_cache.reserve = reserve;
43 }
44
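/* Number of cached nodes the shrinker is allowed to free (those above the reserve): */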
45 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
46 {
47         return max_t(int, 0, bc->used - bc->reserve);
48 }
49
50 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
51 {
52         if (b->c.lock.readers)
53                 list_move(&b->list, &bc->freed_pcpu);
54         else
55                 list_move(&b->list, &bc->freed_nonpcpu);
56 }
57
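/*
 * Free a node's data buffers and put the struct btree itself on the
 * appropriate freed list; struct btrees are only reused, never freed, until
 * bch2_fs_btree_cache_exit().
 */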
58 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
59 {
60         struct btree_cache *bc = &c->btree_cache;
61
62         EBUG_ON(btree_node_write_in_flight(b));
63
64         kvpfree(b->data, btree_bytes(c));
65         b->data = NULL;
66 #ifdef __KERNEL__
67         vfree(b->aux_data);
68 #else
69         munmap(b->aux_data, btree_aux_data_bytes(b));
70 #endif
71         b->aux_data = NULL;
72
73         bc->used--;
74
75         btree_node_to_freedlist(bc, b);
76 }
77
78 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
79                                    const void *obj)
80 {
81         const struct btree *b = obj;
82         const u64 *v = arg->key;
83
84         return b->hash_val == *v ? 0 : 1;
85 }
86
87 static const struct rhashtable_params bch_btree_cache_params = {
88         .head_offset    = offsetof(struct btree, hash),
89         .key_offset     = offsetof(struct btree, hash_val),
90         .key_len        = sizeof(u64),
91         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
92 };
93
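/*
 * Allocate the buffers backing a node: b->data holds the on disk format btree
 * node, b->aux_data the auxiliary search trees. aux_data is mapped executable
 * (vmalloc_exec()/PROT_EXEC), presumably for the dynamically compiled bkey
 * unpack function.
 */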
94 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
95 {
96         BUG_ON(b->data || b->aux_data);
97
98         b->data = kvpmalloc(btree_bytes(c), gfp);
99         if (!b->data)
100                 return -ENOMEM;
101 #ifdef __KERNEL__
102         b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
103 #else
104         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
105                            PROT_READ|PROT_WRITE|PROT_EXEC,
106                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
107         if (b->aux_data == MAP_FAILED)
108                 b->aux_data = NULL;
109 #endif
110         if (!b->aux_data) {
111                 kvpfree(b->data, btree_bytes(c));
112                 b->data = NULL;
113                 return -ENOMEM;
114         }
115
116         return 0;
117 }
118
119 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
120 {
121         struct btree *b;
122
123         b = kzalloc(sizeof(struct btree), gfp);
124         if (!b)
125                 return NULL;
126
127         bkey_btree_ptr_init(&b->key);
128         __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
129 #ifdef CONFIG_DEBUG_LOCK_ALLOC
130         lockdep_set_no_check_recursion(&b->c.lock.dep_map);
131 #endif
132         INIT_LIST_HEAD(&b->list);
133         INIT_LIST_HEAD(&b->write_blocked);
134         b->byte_order = ilog2(btree_bytes(c));
135         return b;
136 }
137
138 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
139 {
140         struct btree_cache *bc = &c->btree_cache;
141         struct btree *b;
142
143         b = __btree_node_mem_alloc(c, GFP_KERNEL);
144         if (!b)
145                 return NULL;
146
147         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
148                 kfree(b);
149                 return NULL;
150         }
151
152         bc->used++;
153         list_add(&b->list, &bc->freeable);
154         return b;
155 }
156
157 /* Btree in memory cache - hash table */
158
159 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
160 {
161         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
162
163         BUG_ON(ret);
164
165         /* Cause future lookups for this node to fail: */
166         b->hash_val = 0;
167 }
168
169 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
170 {
171         BUG_ON(b->hash_val);
172         b->hash_val = btree_ptr_hash_val(&b->key);
173
174         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
175                                              bch_btree_cache_params);
176 }
177
178 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
179                                 unsigned level, enum btree_id id)
180 {
181         int ret;
182
183         b->c.level      = level;
184         b->c.btree_id   = id;
185
186         mutex_lock(&bc->lock);
187         ret = __bch2_btree_node_hash_insert(bc, b);
188         if (!ret)
189                 list_add_tail(&b->list, &bc->live);
190         mutex_unlock(&bc->lock);
191
192         return ret;
193 }
194
195 __flatten
196 static inline struct btree *btree_cache_find(struct btree_cache *bc,
197                                      const struct bkey_i *k)
198 {
199         u64 v = btree_ptr_hash_val(k);
200
201         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
202 }
203
204 /*
205  * Attempt to make a btree node reclaimable: trylock it and, if @flush is set,
206  * flush it if dirty and wait on in flight IO. Returns 0 with the node locked.
207  */
208 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
209 {
210         struct btree_cache *bc = &c->btree_cache;
211         int ret = 0;
212
213         lockdep_assert_held(&bc->lock);
214 wait_on_io:
215         if (b->flags & ((1U << BTREE_NODE_dirty)|
216                         (1U << BTREE_NODE_read_in_flight)|
217                         (1U << BTREE_NODE_write_in_flight))) {
218                 if (!flush) {
219                         if (btree_node_dirty(b))
220                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
221                         else if (btree_node_read_in_flight(b))
222                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
223                         else if (btree_node_write_in_flight(b))
224                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
225                         return -ENOMEM;
226                 }
227
228                 /* XXX: waiting on IO with btree cache lock held */
229                 bch2_btree_node_wait_on_read(b);
230                 bch2_btree_node_wait_on_write(b);
231         }
232
233         if (!six_trylock_intent(&b->c.lock)) {
234                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
235                 return -ENOMEM;
236         }
237
238         if (!six_trylock_write(&b->c.lock)) {
239                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
240                 goto out_unlock_intent;
241         }
242
243         /* recheck under lock */
244         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
245                         (1U << BTREE_NODE_write_in_flight))) {
246                 if (!flush) {
247                         if (btree_node_read_in_flight(b))
248                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
249                         else if (btree_node_write_in_flight(b))
250                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
251                         goto out_unlock;
252                 }
253                 six_unlock_write(&b->c.lock);
254                 six_unlock_intent(&b->c.lock);
255                 goto wait_on_io;
256         }
257
258         if (btree_node_noevict(b)) {
259                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
260                 goto out_unlock;
261         }
262         if (btree_node_write_blocked(b)) {
263                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
264                 goto out_unlock;
265         }
266         if (btree_node_will_make_reachable(b)) {
267                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
268                 goto out_unlock;
269         }
270
271         if (btree_node_dirty(b)) {
272                 if (!flush) {
273                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
274                         goto out_unlock;
275                 }
276                 /*
277                  * Using the underscore version because we don't want to compact
278                  * bsets after the write, since this node is about to be evicted
279                  * - unless btree verify mode is enabled, since it runs out of
280                  * the post write cleanup:
281                  */
282                 if (bch2_verify_btree_ondisk)
283                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
284                                               BTREE_WRITE_cache_reclaim);
285                 else
286                         __bch2_btree_node_write(c, b,
287                                                 BTREE_WRITE_cache_reclaim);
288
289                 six_unlock_write(&b->c.lock);
290                 six_unlock_intent(&b->c.lock);
291                 goto wait_on_io;
292         }
293 out:
294         if (b->hash_val && !ret)
295                 trace_and_count(c, btree_cache_reap, c, b);
296         return ret;
297 out_unlock:
298         six_unlock_write(&b->c.lock);
299 out_unlock_intent:
300         six_unlock_intent(&b->c.lock);
301         ret = -ENOMEM;
302         goto out;
303 }
304
305 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
306 {
307         return __btree_node_reclaim(c, b, false, shrinker_counter);
308 }
309
310 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
311 {
312         return __btree_node_reclaim(c, b, true, false);
313 }
314
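/*
 * Shrinker scan callback: walk the freeable list first (leaving a few nodes so
 * a btree split doesn't have to hit the system allocator), then the live list,
 * giving nodes with the accessed bit set a second chance. When most of the
 * cache is dirty we also kick off writes of dirty nodes so that future scans
 * can free them.
 */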
315 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
316                                            struct shrink_control *sc)
317 {
318         struct bch_fs *c = container_of(shrink, struct bch_fs,
319                                         btree_cache.shrink);
320         struct btree_cache *bc = &c->btree_cache;
321         struct btree *b, *t;
322         unsigned long nr = sc->nr_to_scan;
323         unsigned long can_free = 0;
324         unsigned long freed = 0;
325         unsigned long touched = 0;
326         unsigned i, flags;
327         unsigned long ret = SHRINK_STOP;
328         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
329                 bc->used * 3 / 4;
330
331         if (bch2_btree_shrinker_disabled)
332                 return SHRINK_STOP;
333
334         mutex_lock(&bc->lock);
335         flags = memalloc_nofs_save();
336
337         /*
338          * It's _really_ critical that we don't free too many btree nodes - we
339          * have to always leave ourselves a reserve. The reserve is how we
340          * guarantee that allocating memory for a new btree node can always
341          * succeed, so that inserting keys into the btree can always succeed and
342          * IO can always make forward progress:
343          */
344         can_free = btree_cache_can_free(bc);
345         nr = min_t(unsigned long, nr, can_free);
346
347         i = 0;
348         list_for_each_entry_safe(b, t, &bc->freeable, list) {
349                 /*
350                  * Leave a few nodes on the freeable list, so that a btree split
351                  * won't have to hit the system allocator:
352                  */
353                 if (++i <= 3)
354                         continue;
355
356                 touched++;
357
358                 if (touched >= nr)
359                         goto out;
360
361                 if (!btree_node_reclaim(c, b, true)) {
362                         btree_node_data_free(c, b);
363                         six_unlock_write(&b->c.lock);
364                         six_unlock_intent(&b->c.lock);
365                         freed++;
366                         bc->freed++;
367                 }
368         }
369 restart:
370         list_for_each_entry_safe(b, t, &bc->live, list) {
371                 touched++;
372
373                 if (btree_node_accessed(b)) {
374                         clear_btree_node_accessed(b);
375                         bc->not_freed_access_bit++;
376                 } else if (!btree_node_reclaim(c, b, true)) {
377                         freed++;
378                         btree_node_data_free(c, b);
379                         bc->freed++;
380
381                         bch2_btree_node_hash_remove(bc, b);
382                         six_unlock_write(&b->c.lock);
383                         six_unlock_intent(&b->c.lock);
384
385                         if (freed == nr)
386                                 goto out_rotate;
387                 } else if (trigger_writes &&
388                            btree_node_dirty(b) &&
389                            !btree_node_will_make_reachable(b) &&
390                            !btree_node_write_blocked(b) &&
391                            six_trylock_read(&b->c.lock)) {
392                         list_move(&bc->live, &b->list);
393                         mutex_unlock(&bc->lock);
394                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
395                         six_unlock_read(&b->c.lock);
396                         if (touched >= nr)
397                                 goto out_nounlock;
398                         mutex_lock(&bc->lock);
399                         goto restart;
400                 }
401
402                 if (touched >= nr)
403                         break;
404         }
405 out_rotate:
406         if (&t->list != &bc->live)
407                 list_move_tail(&bc->live, &t->list);
408 out:
409         mutex_unlock(&bc->lock);
410 out_nounlock:
411         ret = freed;
412         memalloc_nofs_restore(flags);
413         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
414         return ret;
415 }
416
417 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
418                                             struct shrink_control *sc)
419 {
420         struct bch_fs *c = container_of(shrink, struct bch_fs,
421                                         btree_cache.shrink);
422         struct btree_cache *bc = &c->btree_cache;
423
424         if (bch2_btree_shrinker_disabled)
425                 return 0;
426
427         return btree_cache_can_free(bc);
428 }
429
430 static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
431 {
432         struct bch_fs *c = container_of(shrink, struct bch_fs,
433                                         btree_cache.shrink);
434
435         bch2_btree_cache_to_text(out, &c->btree_cache);
436 }
437
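/*
 * Tear down the btree node cache at filesystem shutdown: no node may have IO
 * in flight at this point; remaining dirty state is discarded.
 */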
438 void bch2_fs_btree_cache_exit(struct bch_fs *c)
439 {
440         struct btree_cache *bc = &c->btree_cache;
441         struct btree *b;
442         unsigned i, flags;
443
444         if (bc->shrink.list.next)
445                 unregister_shrinker(&bc->shrink);
446
447         /* vfree() can allocate memory: */
448         flags = memalloc_nofs_save();
449         mutex_lock(&bc->lock);
450
451         if (c->verify_data)
452                 list_move(&c->verify_data->list, &bc->live);
453
454         kvpfree(c->verify_ondisk, btree_bytes(c));
455
456         for (i = 0; i < BTREE_ID_NR; i++)
457                 if (c->btree_roots[i].b)
458                         list_add(&c->btree_roots[i].b->list, &bc->live);
459
460         list_splice(&bc->freeable, &bc->live);
461
462         while (!list_empty(&bc->live)) {
463                 b = list_first_entry(&bc->live, struct btree, list);
464
465                 BUG_ON(btree_node_read_in_flight(b) ||
466                        btree_node_write_in_flight(b));
467
468                 if (btree_node_dirty(b))
469                         bch2_btree_complete_write(c, b, btree_current_write(b));
470                 clear_btree_node_dirty_acct(c, b);
471
472                 btree_node_data_free(c, b);
473         }
474
475         BUG_ON(atomic_read(&c->btree_cache.dirty));
476
477         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
478
479         while (!list_empty(&bc->freed_nonpcpu)) {
480                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
481                 list_del(&b->list);
482                 six_lock_pcpu_free(&b->c.lock);
483                 kfree(b);
484         }
485
486         mutex_unlock(&bc->lock);
487         memalloc_nofs_restore(flags);
488
489         if (bc->table_init_done)
490                 rhashtable_destroy(&bc->table);
491 }
492
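/*
 * Set up the hash table, preallocate the reserve of btree nodes onto the
 * freeable list, and register the shrinker.
 */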
493 int bch2_fs_btree_cache_init(struct bch_fs *c)
494 {
495         struct btree_cache *bc = &c->btree_cache;
496         unsigned i;
497         int ret = 0;
498
499         pr_verbose_init(c->opts, "");
500
501         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
502         if (ret)
503                 goto out;
504
505         bc->table_init_done = true;
506
507         bch2_recalc_btree_reserve(c);
508
509         for (i = 0; i < bc->reserve; i++)
510                 if (!__bch2_btree_node_mem_alloc(c)) {
511                         ret = -ENOMEM;
512                         goto out;
513                 }
514
515         list_splice_init(&bc->live, &bc->freeable);
516
517         mutex_init(&c->verify_lock);
518
519         bc->shrink.count_objects        = bch2_btree_cache_count;
520         bc->shrink.scan_objects         = bch2_btree_cache_scan;
521         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
522         bc->shrink.seeks                = 4;
523         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
524 out:
525         pr_verbose_init(c->opts, "ret %i", ret);
526         return ret;
527 }
528
529 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
530 {
531         mutex_init(&bc->lock);
532         INIT_LIST_HEAD(&bc->live);
533         INIT_LIST_HEAD(&bc->freeable);
534         INIT_LIST_HEAD(&bc->freed_pcpu);
535         INIT_LIST_HEAD(&bc->freed_nonpcpu);
536 }
537
538 /*
539  * We can only have one thread cannibalizing other cached btree nodes at a time,
540  * or we'll deadlock. We use an open coded mutex to ensure that, which
541  * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
542  * the root of the btree, we need to release this lock if we have it held.
543  */
544 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
545 {
546         struct btree_cache *bc = &c->btree_cache;
547
548         if (bc->alloc_lock == current) {
549                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
550                 bc->alloc_lock = NULL;
551                 closure_wake_up(&bc->alloc_wait);
552         }
553 }
554
555 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
556 {
557         struct btree_cache *bc = &c->btree_cache;
558         struct task_struct *old;
559
560         old = cmpxchg(&bc->alloc_lock, NULL, current);
561         if (old == NULL || old == current)
562                 goto success;
563
564         if (!cl) {
565                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
566                 return -ENOMEM;
567         }
568
569         closure_wait(&bc->alloc_wait, cl);
570
571         /* Try again, after adding ourselves to waitlist */
572         old = cmpxchg(&bc->alloc_lock, NULL, current);
573         if (old == NULL || old == current) {
574                 /* We raced */
575                 closure_wake_up(&bc->alloc_wait);
576                 goto success;
577         }
578
579         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
580         return -EAGAIN;
581
582 success:
583         trace_and_count(c, btree_cache_cannibalize_lock, c);
584         return 0;
585 }
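
/*
 * Illustrative locking pattern (hypothetical caller; see e.g. the btree
 * interior update code for real usage, which handles errors and closures more
 * carefully):
 *
 *	ret = bch2_btree_cache_cannibalize_lock(c, &cl);
 *	if (ret)
 *		return ret;
 *
 *	b = bch2_btree_node_mem_alloc(c, false);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(c);
 */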
586
587 static struct btree *btree_node_cannibalize(struct bch_fs *c)
588 {
589         struct btree_cache *bc = &c->btree_cache;
590         struct btree *b;
591
592         list_for_each_entry_reverse(b, &bc->live, list)
593                 if (!btree_node_reclaim(c, b, false))
594                         return b;
595
596         while (1) {
597                 list_for_each_entry_reverse(b, &bc->live, list)
598                         if (!btree_node_write_and_reclaim(c, b))
599                                 return b;
600
601                 /*
602                  * Rare case: all nodes were intent-locked.
603                  * Just busy-wait.
604                  */
605                 WARN_ONCE(1, "btree cache cannibalize failed\n");
606                 cond_resched();
607         }
608 }
609
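/*
 * Allocate a btree node that isn't yet hashed or associated with anything:
 * reuse a struct btree from the freed lists and data buffers from the freeable
 * list when possible, fall back to the allocator, and as a last resort
 * cannibalize another cached node (only if this thread holds the cannibalize
 * lock).
 */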
610 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
611 {
612         struct btree_cache *bc = &c->btree_cache;
613         struct list_head *freed = pcpu_read_locks
614                 ? &bc->freed_pcpu
615                 : &bc->freed_nonpcpu;
616         struct btree *b, *b2;
617         u64 start_time = local_clock();
618         unsigned flags;
619
620         flags = memalloc_nofs_save();
621         mutex_lock(&bc->lock);
622
623         /*
624          * We never free struct btree itself, just the memory that holds the on
625          * disk node. Check the freed list before allocating a new one:
626          */
627         list_for_each_entry(b, freed, list)
628                 if (!btree_node_reclaim(c, b, false)) {
629                         list_del_init(&b->list);
630                         goto got_node;
631                 }
632
633         b = __btree_node_mem_alloc(c, __GFP_NOWARN);
634         if (!b) {
635                 mutex_unlock(&bc->lock);
636                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
637                 if (!b)
638                         goto err;
639                 mutex_lock(&bc->lock);
640         }
641
642         if (pcpu_read_locks)
643                 six_lock_pcpu_alloc(&b->c.lock);
644
645         BUG_ON(!six_trylock_intent(&b->c.lock));
646         BUG_ON(!six_trylock_write(&b->c.lock));
647 got_node:
648
649         /*
650          * btree_free() doesn't free memory; it sticks the node on the end of
651          * the list. Check if there's any freed nodes there:
652          */
653         list_for_each_entry(b2, &bc->freeable, list)
654                 if (!btree_node_reclaim(c, b2, false)) {
655                         swap(b->data, b2->data);
656                         swap(b->aux_data, b2->aux_data);
657                         btree_node_to_freedlist(bc, b2);
658                         six_unlock_write(&b2->c.lock);
659                         six_unlock_intent(&b2->c.lock);
660                         goto got_mem;
661                 }
662
663         mutex_unlock(&bc->lock);
664
665         if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
666                 goto err;
667
668         mutex_lock(&bc->lock);
669         bc->used++;
670 got_mem:
671         mutex_unlock(&bc->lock);
672
673         BUG_ON(btree_node_hashed(b));
674         BUG_ON(btree_node_dirty(b));
675         BUG_ON(btree_node_write_in_flight(b));
676 out:
677         b->flags                = 0;
678         b->written              = 0;
679         b->nsets                = 0;
680         b->sib_u64s[0]          = 0;
681         b->sib_u64s[1]          = 0;
682         b->whiteout_u64s        = 0;
683         bch2_btree_keys_init(b);
684         set_btree_node_accessed(b);
685
686         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
687                                start_time);
688
689         memalloc_nofs_restore(flags);
690         return b;
691 err:
692         mutex_lock(&bc->lock);
693
694         /* Try to cannibalize another cached btree node: */
695         if (bc->alloc_lock == current) {
696                 b2 = btree_node_cannibalize(c);
697                 bch2_btree_node_hash_remove(bc, b2);
698
699                 if (b) {
700                         swap(b->data, b2->data);
701                         swap(b->aux_data, b2->aux_data);
702                         btree_node_to_freedlist(bc, b2);
703                         six_unlock_write(&b2->c.lock);
704                         six_unlock_intent(&b2->c.lock);
705                 } else {
706                         b = b2;
707                         list_del_init(&b->list);
708                 }
709
710                 mutex_unlock(&bc->lock);
711
712                 trace_and_count(c, btree_cache_cannibalize, c);
713                 goto out;
714         }
715
716         mutex_unlock(&bc->lock);
717         memalloc_nofs_restore(flags);
718         return ERR_PTR(-ENOMEM);
719 }
720
721 /* Slowpath, don't want it inlined into btree_iter_traverse() */
722 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
723                                 struct btree_trans *trans,
724                                 struct btree_path *path,
725                                 const struct bkey_i *k,
726                                 enum btree_id btree_id,
727                                 unsigned level,
728                                 enum six_lock_type lock_type,
729                                 bool sync)
730 {
731         struct btree_cache *bc = &c->btree_cache;
732         struct btree *b;
733         u32 seq;
734
735         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
736         /*
737          * Parent node must be locked, else we could read in a btree node that's
738          * been freed:
739          */
740         if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
741                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
742                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
743         }
744
745         b = bch2_btree_node_mem_alloc(c, level != 0);
746
747         if (trans && b == ERR_PTR(-ENOMEM)) {
748                 trans->memory_allocation_failure = true;
749                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
750                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
751         }
752
753         if (IS_ERR(b))
754                 return b;
755
756         /*
757          * Btree nodes read in from disk should not have the accessed bit set
758          * initially, so that linear scans don't thrash the cache:
759          */
760         clear_btree_node_accessed(b);
761
762         bkey_copy(&b->key, k);
763         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
764                 /* raced with another fill: */
765
766                 /* mark as unhashed... */
767                 b->hash_val = 0;
768
769                 mutex_lock(&bc->lock);
770                 list_add(&b->list, &bc->freeable);
771                 mutex_unlock(&bc->lock);
772
773                 six_unlock_write(&b->c.lock);
774                 six_unlock_intent(&b->c.lock);
775                 return NULL;
776         }
777
778         set_btree_node_read_in_flight(b);
779
780         six_unlock_write(&b->c.lock);
781         seq = b->c.lock.state.seq;
782         six_unlock_intent(&b->c.lock);
783
784         /* Unlock before doing IO: */
785         if (trans && sync)
786                 bch2_trans_unlock(trans);
787
788         bch2_btree_node_read(c, b, sync);
789
790         if (!sync)
791                 return NULL;
792
793         if (trans) {
794                 int ret = bch2_trans_relock(trans) ?:
795                         bch2_btree_path_relock_intent(trans, path);
796                 if (ret) {
797                         BUG_ON(!trans->restarted);
798                         return ERR_PTR(ret);
799                 }
800         }
801
802         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
803                 if (trans)
804                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
805                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
806         }
807
808         return b;
809 }
810
811 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
812 {
813         struct printbuf buf = PRINTBUF;
814
815         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
816                 return;
817
818         prt_printf(&buf,
819                "btree node header doesn't match ptr\n"
820                "btree %s level %u\n"
821                "ptr: ",
822                bch2_btree_ids[b->c.btree_id], b->c.level);
823         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
824
825         prt_printf(&buf, "\nheader: btree %s level %llu\n"
826                "min ",
827                bch2_btree_ids[BTREE_NODE_ID(b->data)],
828                BTREE_NODE_LEVEL(b->data));
829         bch2_bpos_to_text(&buf, b->data->min_key);
830
831         prt_printf(&buf, "\nmax ");
832         bch2_bpos_to_text(&buf, b->data->max_key);
833
834         bch2_fs_inconsistent(c, "%s", buf.buf);
835         printbuf_exit(&buf);
836 }
837
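/*
 * Check that the node's header (btree id, level, min/max keys) matches the
 * pointer it was looked up by; a mismatch is reported as a filesystem
 * inconsistency via btree_bad_header().
 */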
838 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
839 {
840         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
841             b->c.level != BTREE_NODE_LEVEL(b->data) ||
842             !bpos_eq(b->data->max_key, b->key.k.p) ||
843             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
844              !bpos_eq(b->data->min_key,
845                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
846                 btree_bad_header(c, b);
847 }
848
849 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
850                                            const struct bkey_i *k, unsigned level,
851                                            enum six_lock_type lock_type,
852                                            unsigned long trace_ip)
853 {
854         struct bch_fs *c = trans->c;
855         struct btree_cache *bc = &c->btree_cache;
856         struct btree *b;
857         struct bset_tree *t;
858         int ret;
859
860         EBUG_ON(level >= BTREE_MAX_DEPTH);
861 retry:
862         b = btree_cache_find(bc, k);
863         if (unlikely(!b)) {
864                 /*
865                  * We must have the parent locked to call bch2_btree_node_fill(),
866                  * else we could read in a btree node from disk that's been
867                  * freed:
868                  */
869                 b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
870                                          level, lock_type, true);
871
872                 /* We raced and found the btree node in the cache */
873                 if (!b)
874                         goto retry;
875
876                 if (IS_ERR(b))
877                         return b;
878         } else {
879                 if (btree_node_read_locked(path, level + 1))
880                         btree_node_unlock(trans, path, level + 1);
881
882                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
883                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
884                         return ERR_PTR(ret);
885
886                 BUG_ON(ret);
887
888                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
889                              b->c.level != level ||
890                              race_fault())) {
891                         six_unlock_type(&b->c.lock, lock_type);
892                         if (bch2_btree_node_relock(trans, path, level + 1))
893                                 goto retry;
894
895                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
896                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
897                 }
898
899                 /* avoid atomic set bit if it's not needed: */
900                 if (!btree_node_accessed(b))
901                         set_btree_node_accessed(b);
902         }
903
904         if (unlikely(btree_node_read_in_flight(b))) {
905                 u32 seq = b->c.lock.state.seq;
906
907                 six_unlock_type(&b->c.lock, lock_type);
908                 bch2_trans_unlock(trans);
909
910                 bch2_btree_node_wait_on_read(b);
911
912                 /*
913                  * should_be_locked is not set on this path yet, so we need to
914                  * relock it specifically:
915                  */
916                 if (trans) {
917                         int ret = bch2_trans_relock(trans) ?:
918                                 bch2_btree_path_relock_intent(trans, path);
919                         if (ret) {
920                                 BUG_ON(!trans->restarted);
921                                 return ERR_PTR(ret);
922                         }
923                 }
924
925                 if (!six_relock_type(&b->c.lock, lock_type, seq))
926                         goto retry;
927         }
928
929         prefetch(b->aux_data);
930
931         for_each_bset(b, t) {
932                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
933
934                 prefetch(p + L1_CACHE_BYTES * 0);
935                 prefetch(p + L1_CACHE_BYTES * 1);
936                 prefetch(p + L1_CACHE_BYTES * 2);
937         }
938
939         if (unlikely(btree_node_read_error(b))) {
940                 six_unlock_type(&b->c.lock, lock_type);
941                 return ERR_PTR(-EIO);
942         }
943
944         EBUG_ON(b->c.btree_id != path->btree_id);
945         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
946         btree_check_header(c, b);
947
948         return b;
949 }
950
951 /**
952  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
953  * in from disk if necessary.
954  *
955  * Returns an ERR_PTR on failure: -EIO if the node couldn't be read, or a
956  * transaction restart error code if (re)locking failed.
957  *
958  * The btree node will be locked with @lock_type on return.
959  */
960 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
961                                   const struct bkey_i *k, unsigned level,
962                                   enum six_lock_type lock_type,
963                                   unsigned long trace_ip)
964 {
965         struct bch_fs *c = trans->c;
966         struct btree *b;
967         struct bset_tree *t;
968         int ret;
969
970         EBUG_ON(level >= BTREE_MAX_DEPTH);
971
972         b = btree_node_mem_ptr(k);
973
974         /*
975          * Check b->hash_val _before_ calling btree_node_lock() - this might not
976          * be the node we want anymore, and trying to lock the wrong node could
977          * cause an unnecessary transaction restart:
978          */
979         if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
980                      !b ||
981                      b->hash_val != btree_ptr_hash_val(k)))
982                 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
983
984         if (btree_node_read_locked(path, level + 1))
985                 btree_node_unlock(trans, path, level + 1);
986
987         ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
988         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
989                 return ERR_PTR(ret);
990
991         BUG_ON(ret);
992
993         if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
994                      b->c.level != level ||
995                      race_fault())) {
996                 six_unlock_type(&b->c.lock, lock_type);
997                 if (bch2_btree_node_relock(trans, path, level + 1))
998                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
999
1000                 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1001                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1002         }
1003
1004         if (unlikely(btree_node_read_in_flight(b))) {
1005                 u32 seq = b->c.lock.state.seq;
1006
1007                 six_unlock_type(&b->c.lock, lock_type);
1008                 bch2_trans_unlock(trans);
1009
1010                 bch2_btree_node_wait_on_read(b);
1011
1012                 /*
1013                  * should_be_locked is not set on this path yet, so we need to
1014                  * relock it specifically:
1015                  */
1016                 if (trans) {
1017                         int ret = bch2_trans_relock(trans) ?:
1018                                 bch2_btree_path_relock_intent(trans, path);
1019                         if (ret) {
1020                                 BUG_ON(!trans->restarted);
1021                                 return ERR_PTR(ret);
1022                         }
1023                 }
1024
1025                 if (!six_relock_type(&b->c.lock, lock_type, seq))
1026                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1027         }
1028
1029         prefetch(b->aux_data);
1030
1031         for_each_bset(b, t) {
1032                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1033
1034                 prefetch(p + L1_CACHE_BYTES * 0);
1035                 prefetch(p + L1_CACHE_BYTES * 1);
1036                 prefetch(p + L1_CACHE_BYTES * 2);
1037         }
1038
1039         /* avoid atomic set bit if it's not needed: */
1040         if (!btree_node_accessed(b))
1041                 set_btree_node_accessed(b);
1042
1043         if (unlikely(btree_node_read_error(b))) {
1044                 six_unlock_type(&b->c.lock, lock_type);
1045                 return ERR_PTR(-EIO);
1046         }
1047
1048         EBUG_ON(b->c.btree_id != path->btree_id);
1049         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1050         btree_check_header(c, b);
1051
1052         return b;
1053 }
1054
1055 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1056                                          const struct bkey_i *k,
1057                                          enum btree_id btree_id,
1058                                          unsigned level,
1059                                          bool nofill)
1060 {
1061         struct bch_fs *c = trans->c;
1062         struct btree_cache *bc = &c->btree_cache;
1063         struct btree *b;
1064         struct bset_tree *t;
1065         int ret;
1066
1067         EBUG_ON(level >= BTREE_MAX_DEPTH);
1068
1069         if (c->opts.btree_node_mem_ptr_optimization) {
1070                 b = btree_node_mem_ptr(k);
1071                 if (b)
1072                         goto lock_node;
1073         }
1074 retry:
1075         b = btree_cache_find(bc, k);
1076         if (unlikely(!b)) {
1077                 if (nofill)
1078                         goto out;
1079
1080                 b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
1081                                          level, SIX_LOCK_read, true);
1082
1083                 /* We raced and found the btree node in the cache */
1084                 if (!b)
1085                         goto retry;
1086
1087                 if (IS_ERR(b) &&
1088                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1089                         goto retry;
1090
1091                 if (IS_ERR(b))
1092                         goto out;
1093         } else {
1094 lock_node:
1095                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
1096                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1097                         return ERR_PTR(ret);
1098
1099                 BUG_ON(ret);
1100
1101                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1102                              b->c.btree_id != btree_id ||
1103                              b->c.level != level)) {
1104                         six_unlock_read(&b->c.lock);
1105                         goto retry;
1106                 }
1107         }
1108
1109         /* XXX: waiting on IO with btree locks held: */
1110         __bch2_btree_node_wait_on_read(b);
1111
1112         prefetch(b->aux_data);
1113
1114         for_each_bset(b, t) {
1115                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1116
1117                 prefetch(p + L1_CACHE_BYTES * 0);
1118                 prefetch(p + L1_CACHE_BYTES * 1);
1119                 prefetch(p + L1_CACHE_BYTES * 2);
1120         }
1121
1122         /* avoid atomic set bit if it's not needed: */
1123         if (!btree_node_accessed(b))
1124                 set_btree_node_accessed(b);
1125
1126         if (unlikely(btree_node_read_error(b))) {
1127                 six_unlock_read(&b->c.lock);
1128                 b = ERR_PTR(-EIO);
1129                 goto out;
1130         }
1131
1132         EBUG_ON(b->c.btree_id != btree_id);
1133         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1134         btree_check_header(c, b);
1135 out:
1136         bch2_btree_cache_cannibalize_unlock(c);
1137         return b;
1138 }
1139
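/*
 * Start an asynchronous read of a btree node into the cache (a fill with
 * sync == false); returns 0 immediately if the node is already cached.
 */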
1140 int bch2_btree_node_prefetch(struct bch_fs *c,
1141                              struct btree_trans *trans,
1142                              struct btree_path *path,
1143                              const struct bkey_i *k,
1144                              enum btree_id btree_id, unsigned level)
1145 {
1146         struct btree_cache *bc = &c->btree_cache;
1147         struct btree *b;
1148
1149         BUG_ON(trans && !btree_node_locked(path, level + 1));
1150         BUG_ON(level >= BTREE_MAX_DEPTH);
1151
1152         b = btree_cache_find(bc, k);
1153         if (b)
1154                 return 0;
1155
1156         b = bch2_btree_node_fill(c, trans, path, k, btree_id,
1157                                  level, SIX_LOCK_read, false);
1158         return PTR_ERR_OR_ZERO(b);
1159 }
1160
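/*
 * Drop a node from the cache entirely: wait for in flight IO, write it out if
 * dirty, then free its data and remove it from the hash table. Called with the
 * node's key, e.g. from btree gc.
 */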
1161 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1162 {
1163         struct bch_fs *c = trans->c;
1164         struct btree_cache *bc = &c->btree_cache;
1165         struct btree *b;
1166
1167         b = btree_cache_find(bc, k);
1168         if (!b)
1169                 return;
1170 wait_on_io:
1171         /* not allowed to wait on io with btree locks held: */
1172
1173         /* XXX we're called from btree_gc which will be holding other btree
1174          * nodes locked
1175          */
1176         __bch2_btree_node_wait_on_read(b);
1177         __bch2_btree_node_wait_on_write(b);
1178
1179         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1180         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1181
1182         if (btree_node_dirty(b)) {
1183                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1184                 six_unlock_write(&b->c.lock);
1185                 six_unlock_intent(&b->c.lock);
1186                 goto wait_on_io;
1187         }
1188
1189         BUG_ON(btree_node_dirty(b));
1190
1191         mutex_lock(&bc->lock);
1192         btree_node_data_free(c, b);
1193         bch2_btree_node_hash_remove(bc, b);
1194         mutex_unlock(&bc->lock);
1195
1196         six_unlock_write(&b->c.lock);
1197         six_unlock_intent(&b->c.lock);
1198 }
1199
1200 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1201                              struct btree *b)
1202 {
1203         const struct bkey_format *f = &b->format;
1204         struct bset_stats stats;
1205
1206         memset(&stats, 0, sizeof(stats));
1207
1208         bch2_btree_keys_stats(b, &stats);
1209
1210         prt_printf(out, "l %u ", b->c.level);
1211         bch2_bpos_to_text(out, b->data->min_key);
1212         prt_printf(out, " - ");
1213         bch2_bpos_to_text(out, b->data->max_key);
1214         prt_printf(out, ":\n"
1215                "    ptrs: ");
1216         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1217
1218         prt_printf(out, "\n"
1219                "    format: u64s %u fields %u %u %u %u %u\n"
1220                "    unpack fn len: %u\n"
1221                "    bytes used %zu/%zu (%zu%% full)\n"
1222                "    sib u64s: %u, %u (merge threshold %u)\n"
1223                "    nr packed keys %u\n"
1224                "    nr unpacked keys %u\n"
1225                "    floats %zu\n"
1226                "    failed unpacked %zu\n",
1227                f->key_u64s,
1228                f->bits_per_field[0],
1229                f->bits_per_field[1],
1230                f->bits_per_field[2],
1231                f->bits_per_field[3],
1232                f->bits_per_field[4],
1233                b->unpack_fn_len,
1234                b->nr.live_u64s * sizeof(u64),
1235                btree_bytes(c) - sizeof(struct btree_node),
1236                b->nr.live_u64s * 100 / btree_max_u64s(c),
1237                b->sib_u64s[0],
1238                b->sib_u64s[1],
1239                c->btree_foreground_merge_threshold,
1240                b->nr.packed_keys,
1241                b->nr.unpacked_keys,
1242                stats.floats,
1243                stats.failed);
1244 }
1245
1246 void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
1247 {
1248         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1249         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1250         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1251
1252         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1253         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1254         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1255         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1256         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1257         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1258         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1259         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1260         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1261         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1262
1263 }