1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12
13 #include <linux/prefetch.h>
14 #include <linux/sched/mm.h>
15 #include <trace/events/bcachefs.h>
16
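/*
 * When called on behalf of the shrinker (shrinker_counter == true), record the
 * reason a node could not be freed; the per-reason counters are reported by
 * bch2_btree_cache_to_text().
 */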
17 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
18 do {                                             \
19         if (shrinker_counter)                    \
20                 bc->not_freed_##counter++;       \
21 } while (0)
22
23 const char * const bch2_btree_node_flags[] = {
24 #define x(f)    #f,
25         BTREE_FLAGS()
26 #undef x
27         NULL
28 };
29
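/*
 * Size the btree node reserve from the number of btree roots and their depth;
 * the reserve is what guarantees that allocating a new btree node can always
 * succeed (see the comment in bch2_btree_cache_scan()).
 */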
30 void bch2_recalc_btree_reserve(struct bch_fs *c)
31 {
32         unsigned i, reserve = 16;
33
34         if (!c->btree_roots[0].b)
35                 reserve += 8;
36
37         for (i = 0; i < BTREE_ID_NR; i++)
38                 if (c->btree_roots[i].b)
39                         reserve += min_t(unsigned, 1,
40                                          c->btree_roots[i].b->c.level) * 8;
41
42         c->btree_cache.reserve = reserve;
43 }
44
45 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
46 {
47         return max_t(int, 0, bc->used - bc->reserve);
48 }
49
50 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
51 {
52         if (b->c.lock.readers)
53                 list_move(&b->list, &bc->freed_pcpu);
54         else
55                 list_move(&b->list, &bc->freed_nonpcpu);
56 }
57
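/*
 * Free the buffers backing a node's contents; the struct btree itself is kept
 * and parked on one of the freed lists for reuse.
 */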
58 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
59 {
60         struct btree_cache *bc = &c->btree_cache;
61
62         EBUG_ON(btree_node_write_in_flight(b));
63
64         kvpfree(b->data, btree_bytes(c));
65         b->data = NULL;
66 #ifdef __KERNEL__
67         vfree(b->aux_data);
68 #else
69         munmap(b->aux_data, btree_aux_data_bytes(b));
70 #endif
71         b->aux_data = NULL;
72
73         bc->used--;
74
75         btree_node_to_freedlist(bc, b);
76 }
77
78 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
79                                    const void *obj)
80 {
81         const struct btree *b = obj;
82         const u64 *v = arg->key;
83
84         return b->hash_val == *v ? 0 : 1;
85 }
86
87 static const struct rhashtable_params bch_btree_cache_params = {
88         .head_offset    = offsetof(struct btree, hash),
89         .key_offset     = offsetof(struct btree, hash_val),
90         .key_len        = sizeof(u64),
91         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
92 };
93
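/*
 * Allocate the data and aux_data buffers for a node; userspace builds use an
 * executable anonymous mmap in place of vmalloc_exec().
 */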
94 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
95 {
96         BUG_ON(b->data || b->aux_data);
97
98         b->data = kvpmalloc(btree_bytes(c), gfp);
99         if (!b->data)
100                 return -ENOMEM;
101 #ifdef __KERNEL__
102         b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
103 #else
104         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
105                            PROT_READ|PROT_WRITE|PROT_EXEC,
106                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
107         if (b->aux_data == MAP_FAILED)
108                 b->aux_data = NULL;
109 #endif
110         if (!b->aux_data) {
111                 kvpfree(b->data, btree_bytes(c));
112                 b->data = NULL;
113                 return -ENOMEM;
114         }
115
116         return 0;
117 }
118
119 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
120 {
121         struct btree *b;
122
123         b = kzalloc(sizeof(struct btree), gfp);
124         if (!b)
125                 return NULL;
126
127         bkey_btree_ptr_init(&b->key);
128         __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
129 #ifdef CONFIG_DEBUG_LOCK_ALLOC
130         lockdep_set_no_check_recursion(&b->c.lock.dep_map);
131 #endif
132         INIT_LIST_HEAD(&b->list);
133         INIT_LIST_HEAD(&b->write_blocked);
134         b->byte_order = ilog2(btree_bytes(c));
135         return b;
136 }
137
138 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
139 {
140         struct btree_cache *bc = &c->btree_cache;
141         struct btree *b;
142
143         b = __btree_node_mem_alloc(c, GFP_KERNEL);
144         if (!b)
145                 return NULL;
146
147         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
148                 kfree(b);
149                 return NULL;
150         }
151
152         bc->used++;
153         list_add(&b->list, &bc->freeable);
154         return b;
155 }
156
157 /* Btree in memory cache - hash table */
158
159 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
160 {
161         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
162
163         BUG_ON(ret);
164
165         /* Cause future lookups for this node to fail: */
166         b->hash_val = 0;
167 }
168
169 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
170 {
171         BUG_ON(b->hash_val);
172         b->hash_val = btree_ptr_hash_val(&b->key);
173
174         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
175                                              bch_btree_cache_params);
176 }
177
178 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
179                                 unsigned level, enum btree_id id)
180 {
181         int ret;
182
183         b->c.level      = level;
184         b->c.btree_id   = id;
185
186         mutex_lock(&bc->lock);
187         ret = __bch2_btree_node_hash_insert(bc, b);
188         if (!ret)
189                 list_add_tail(&b->list, &bc->live);
190         mutex_unlock(&bc->lock);
191
192         return ret;
193 }
194
195 __flatten
196 static inline struct btree *btree_cache_find(struct btree_cache *bc,
197                                      const struct bkey_i *k)
198 {
199         u64 v = btree_ptr_hash_val(k);
200
201         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
202 }
203
204 /*
205  * Try to reclaim a btree node: returns 0 with the node intent and write
206  * locked if its memory can be freed or reused, -ENOMEM otherwise
207  */
208 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
209 {
210         struct btree_cache *bc = &c->btree_cache;
211         int ret = 0;
212
213         lockdep_assert_held(&bc->lock);
214 wait_on_io:
215         if (b->flags & ((1U << BTREE_NODE_dirty)|
216                         (1U << BTREE_NODE_read_in_flight)|
217                         (1U << BTREE_NODE_write_in_flight))) {
218                 if (!flush) {
219                         if (btree_node_dirty(b))
220                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
221                         else if (btree_node_read_in_flight(b))
222                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
223                         else if (btree_node_write_in_flight(b))
224                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
225                         return -ENOMEM;
226                 }
227
228                 /* XXX: waiting on IO with btree cache lock held */
229                 bch2_btree_node_wait_on_read(b);
230                 bch2_btree_node_wait_on_write(b);
231         }
232
233         if (!six_trylock_intent(&b->c.lock)) {
234                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
235                 return -ENOMEM;
236         }
237
238         if (!six_trylock_write(&b->c.lock)) {
239                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
240                 goto out_unlock_intent;
241         }
242
243         /* recheck under lock */
244         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
245                         (1U << BTREE_NODE_write_in_flight))) {
246                 if (!flush) {
247                         if (btree_node_read_in_flight(b))
248                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
249                         else if (btree_node_write_in_flight(b))
250                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
251                         goto out_unlock;
252                 }
253                 six_unlock_write(&b->c.lock);
254                 six_unlock_intent(&b->c.lock);
255                 goto wait_on_io;
256         }
257
258         if (btree_node_noevict(b)) {
259                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
260                 goto out_unlock;
261         }
262         if (btree_node_write_blocked(b)) {
263                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
264                 goto out_unlock;
265         }
266         if (btree_node_will_make_reachable(b)) {
267                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
268                 goto out_unlock;
269         }
270
271         if (btree_node_dirty(b)) {
272                 if (!flush) {
273                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
274                         goto out_unlock;
275                 }
276                 /*
277                  * Using the underscore version because we don't want to compact
278                  * bsets after the write, since this node is about to be evicted
279                  * - unless btree verify mode is enabled, since the verify code
280                  * runs as part of the post write cleanup:
281                  */
282                 if (bch2_verify_btree_ondisk)
283                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
284                                               BTREE_WRITE_cache_reclaim);
285                 else
286                         __bch2_btree_node_write(c, b,
287                                                 BTREE_WRITE_cache_reclaim);
288
289                 six_unlock_write(&b->c.lock);
290                 six_unlock_intent(&b->c.lock);
291                 goto wait_on_io;
292         }
293 out:
294         if (b->hash_val && !ret)
295                 trace_and_count(c, btree_cache_reap, c, b);
296         return ret;
297 out_unlock:
298         six_unlock_write(&b->c.lock);
299 out_unlock_intent:
300         six_unlock_intent(&b->c.lock);
301         ret = -ENOMEM;
302         goto out;
303 }
304
305 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
306 {
307         return __btree_node_reclaim(c, b, false, shrinker_counter);
308 }
309
310 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
311 {
312         return __btree_node_reclaim(c, b, true, false);
313 }
314
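/*
 * Shrinker scan: walk the freeable list first (leaving a few nodes behind for
 * btree splits), then the live list, clearing accessed bits and freeing what
 * can be reclaimed; when most of the cache is dirty, also kick off writes so
 * that future scans can make progress.
 */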
315 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
316                                            struct shrink_control *sc)
317 {
318         struct bch_fs *c = container_of(shrink, struct bch_fs,
319                                         btree_cache.shrink);
320         struct btree_cache *bc = &c->btree_cache;
321         struct btree *b, *t;
322         unsigned long nr = sc->nr_to_scan;
323         unsigned long can_free = 0;
324         unsigned long freed = 0;
325         unsigned long touched = 0;
326         unsigned i, flags;
327         unsigned long ret = SHRINK_STOP;
328         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
329                 bc->used * 3 / 4;
330
331         if (bch2_btree_shrinker_disabled)
332                 return SHRINK_STOP;
333
334         mutex_lock(&bc->lock);
335         flags = memalloc_nofs_save();
336
337         /*
338          * It's _really_ critical that we don't free too many btree nodes - we
339          * have to always leave ourselves a reserve. The reserve is how we
340          * guarantee that allocating memory for a new btree node can always
341          * succeed, so that inserting keys into the btree can always succeed and
342          * IO can always make forward progress:
343          */
344         can_free = btree_cache_can_free(bc);
345         nr = min_t(unsigned long, nr, can_free);
346
347         i = 0;
348         list_for_each_entry_safe(b, t, &bc->freeable, list) {
349                 /*
350                  * Leave a few nodes on the freeable list, so that a btree split
351                  * won't have to hit the system allocator:
352                  */
353                 if (++i <= 3)
354                         continue;
355
356                 touched++;
357
358                 if (touched >= nr)
359                         goto out;
360
361                 if (!btree_node_reclaim(c, b, true)) {
362                         btree_node_data_free(c, b);
363                         six_unlock_write(&b->c.lock);
364                         six_unlock_intent(&b->c.lock);
365                         freed++;
366                         bc->freed++;
367                 }
368         }
369 restart:
370         list_for_each_entry_safe(b, t, &bc->live, list) {
371                 touched++;
372
373                 if (btree_node_accessed(b)) {
374                         clear_btree_node_accessed(b);
375                         bc->not_freed_access_bit++;
376                 } else if (!btree_node_reclaim(c, b, true)) {
377                         freed++;
378                         btree_node_data_free(c, b);
379                         bc->freed++;
380
381                         bch2_btree_node_hash_remove(bc, b);
382                         six_unlock_write(&b->c.lock);
383                         six_unlock_intent(&b->c.lock);
384
385                         if (freed == nr)
386                                 goto out_rotate;
387                 } else if (trigger_writes &&
388                            btree_node_dirty(b) &&
389                            !btree_node_will_make_reachable(b) &&
390                            !btree_node_write_blocked(b) &&
391                            six_trylock_read(&b->c.lock)) {
392                         list_move(&bc->live, &b->list);
393                         mutex_unlock(&bc->lock);
394                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
395                         six_unlock_read(&b->c.lock);
396                         if (touched >= nr)
397                                 goto out_nounlock;
398                         mutex_lock(&bc->lock);
399                         goto restart;
400                 }
401
402                 if (touched >= nr)
403                         break;
404         }
405 out_rotate:
406         if (&t->list != &bc->live)
407                 list_move_tail(&bc->live, &t->list);
408 out:
409         mutex_unlock(&bc->lock);
410 out_nounlock:
411         ret = freed;
412         memalloc_nofs_restore(flags);
413         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
414         return ret;
415 }
416
417 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
418                                             struct shrink_control *sc)
419 {
420         struct bch_fs *c = container_of(shrink, struct bch_fs,
421                                         btree_cache.shrink);
422         struct btree_cache *bc = &c->btree_cache;
423
424         if (bch2_btree_shrinker_disabled)
425                 return 0;
426
427         return btree_cache_can_free(bc);
428 }
429
430 static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
431 {
432         struct bch_fs *c = container_of(shrink, struct bch_fs,
433                                         btree_cache.shrink);
434
435         bch2_btree_cache_to_text(out, &c->btree_cache);
436 }
437
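/*
 * Tear down the btree node cache: gather every node onto one list, complete
 * any pending writes on dirty nodes, free node memory, then free the struct
 * btrees parked on the freed lists.
 */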
438 void bch2_fs_btree_cache_exit(struct bch_fs *c)
439 {
440         struct btree_cache *bc = &c->btree_cache;
441         struct btree *b;
442         unsigned i, flags;
443
444         if (bc->shrink.list.next)
445                 unregister_shrinker(&bc->shrink);
446
447         /* vfree() can allocate memory: */
448         flags = memalloc_nofs_save();
449         mutex_lock(&bc->lock);
450
451         if (c->verify_data)
452                 list_move(&c->verify_data->list, &bc->live);
453
454         kvpfree(c->verify_ondisk, btree_bytes(c));
455
456         for (i = 0; i < BTREE_ID_NR; i++)
457                 if (c->btree_roots[i].b)
458                         list_add(&c->btree_roots[i].b->list, &bc->live);
459
460         list_splice(&bc->freeable, &bc->live);
461
462         while (!list_empty(&bc->live)) {
463                 b = list_first_entry(&bc->live, struct btree, list);
464
465                 BUG_ON(btree_node_read_in_flight(b) ||
466                        btree_node_write_in_flight(b));
467
468                 if (btree_node_dirty(b))
469                         bch2_btree_complete_write(c, b, btree_current_write(b));
470                 clear_btree_node_dirty_acct(c, b);
471
472                 btree_node_data_free(c, b);
473         }
474
475         BUG_ON(atomic_read(&c->btree_cache.dirty));
476
477         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
478
479         while (!list_empty(&bc->freed_nonpcpu)) {
480                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
481                 list_del(&b->list);
482                 six_lock_pcpu_free(&b->c.lock);
483                 kfree(b);
484         }
485
486         mutex_unlock(&bc->lock);
487         memalloc_nofs_restore(flags);
488
489         if (bc->table_init_done)
490                 rhashtable_destroy(&bc->table);
491 }
492
493 int bch2_fs_btree_cache_init(struct bch_fs *c)
494 {
495         struct btree_cache *bc = &c->btree_cache;
496         unsigned i;
497         int ret = 0;
498
499         pr_verbose_init(c->opts, "");
500
501         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
502         if (ret)
503                 goto out;
504
505         bc->table_init_done = true;
506
507         bch2_recalc_btree_reserve(c);
508
509         for (i = 0; i < bc->reserve; i++)
510                 if (!__bch2_btree_node_mem_alloc(c)) {
511                         ret = -ENOMEM;
512                         goto out;
513                 }
514
515         list_splice_init(&bc->live, &bc->freeable);
516
517         mutex_init(&c->verify_lock);
518
519         bc->shrink.count_objects        = bch2_btree_cache_count;
520         bc->shrink.scan_objects         = bch2_btree_cache_scan;
521         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
522         bc->shrink.seeks                = 4;
523         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
524 out:
525         pr_verbose_init(c->opts, "ret %i", ret);
526         return ret;
527 }
528
529 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
530 {
531         mutex_init(&bc->lock);
532         INIT_LIST_HEAD(&bc->live);
533         INIT_LIST_HEAD(&bc->freeable);
534         INIT_LIST_HEAD(&bc->freed_pcpu);
535         INIT_LIST_HEAD(&bc->freed_nonpcpu);
536 }
537
538 /*
539  * We can only have one thread cannibalizing other cached btree nodes at a time,
540  * or we'll deadlock. We use an open coded mutex (bc->alloc_lock) to ensure
541  * that, taken by bch2_btree_cache_cannibalize_lock(). Every time we unlock
542  * the root of the btree, we need to release this lock if we have it held.
543  */
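/*
 * Illustrative call sequence (not a verbatim caller): hold the cannibalize
 * lock around node allocation so bch2_btree_node_mem_alloc() can fall back to
 * reclaiming a live node, then drop it when done:
 *
 *	if (!bch2_btree_cache_cannibalize_lock(c, &cl)) {
 *		b = bch2_btree_node_mem_alloc(c, false);
 *		...
 *		bch2_btree_cache_cannibalize_unlock(c);
 *	}
 */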
544 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
545 {
546         struct btree_cache *bc = &c->btree_cache;
547
548         if (bc->alloc_lock == current) {
549                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
550                 bc->alloc_lock = NULL;
551                 closure_wake_up(&bc->alloc_wait);
552         }
553 }
554
555 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
556 {
557         struct btree_cache *bc = &c->btree_cache;
558         struct task_struct *old;
559
560         old = cmpxchg(&bc->alloc_lock, NULL, current);
561         if (old == NULL || old == current)
562                 goto success;
563
564         if (!cl) {
565                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
566                 return -ENOMEM;
567         }
568
569         closure_wait(&bc->alloc_wait, cl);
570
571         /* Try again, after adding ourselves to waitlist */
572         old = cmpxchg(&bc->alloc_lock, NULL, current);
573         if (old == NULL || old == current) {
574                 /* We raced */
575                 closure_wake_up(&bc->alloc_wait);
576                 goto success;
577         }
578
579         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
580         return -EAGAIN;
581
582 success:
583         trace_and_count(c, btree_cache_cannibalize_lock, c);
584         return 0;
585 }
586
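/*
 * Steal memory from another cached node: prefer one that can be reclaimed
 * without flushing; failing that, write out and reclaim a dirty one,
 * busy-waiting in the (rare) case that every node is intent-locked.
 */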
587 static struct btree *btree_node_cannibalize(struct bch_fs *c)
588 {
589         struct btree_cache *bc = &c->btree_cache;
590         struct btree *b;
591
592         list_for_each_entry_reverse(b, &bc->live, list)
593                 if (!btree_node_reclaim(c, b, false))
594                         return b;
595
596         while (1) {
597                 list_for_each_entry_reverse(b, &bc->live, list)
598                         if (!btree_node_write_and_reclaim(c, b))
599                                 return b;
600
601                 /*
602                  * Rare case: all nodes were intent-locked.
603                  * Just busy-wait.
604                  */
605                 WARN_ONCE(1, "btree cache cannibalize failed\n");
606                 cond_resched();
607         }
608 }
609
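/*
 * Allocate a node for the in memory cache. Order of preference: reuse a struct
 * btree from the freed list, else allocate a new one; reuse data buffers from
 * the freeable list, else allocate fresh ones; if that fails and we hold the
 * cannibalize lock, steal from another cached node.
 */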
610 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
611 {
612         struct btree_cache *bc = &c->btree_cache;
613         struct list_head *freed = pcpu_read_locks
614                 ? &bc->freed_pcpu
615                 : &bc->freed_nonpcpu;
616         struct btree *b, *b2;
617         u64 start_time = local_clock();
618         unsigned flags;
619
620         flags = memalloc_nofs_save();
621         mutex_lock(&bc->lock);
622
623         /*
624          * We never free struct btree itself, just the memory that holds the on
625          * disk node. Check the freed list before allocating a new one:
626          */
627         list_for_each_entry(b, freed, list)
628                 if (!btree_node_reclaim(c, b, false)) {
629                         list_del_init(&b->list);
630                         goto got_node;
631                 }
632
633         b = __btree_node_mem_alloc(c, __GFP_NOWARN);
634         if (!b) {
635                 mutex_unlock(&bc->lock);
636                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
637                 if (!b)
638                         goto err;
639                 mutex_lock(&bc->lock);
640         }
641
642         if (pcpu_read_locks)
643                 six_lock_pcpu_alloc(&b->c.lock);
644
645         BUG_ON(!six_trylock_intent(&b->c.lock));
646         BUG_ON(!six_trylock_write(&b->c.lock));
647 got_node:
648
649         /*
650          * btree_free() doesn't free memory; it sticks the node on the end of
651          * the list. Check if there are any freed nodes there:
652          */
653         list_for_each_entry(b2, &bc->freeable, list)
654                 if (!btree_node_reclaim(c, b2, false)) {
655                         swap(b->data, b2->data);
656                         swap(b->aux_data, b2->aux_data);
657                         btree_node_to_freedlist(bc, b2);
658                         six_unlock_write(&b2->c.lock);
659                         six_unlock_intent(&b2->c.lock);
660                         goto got_mem;
661                 }
662
663         mutex_unlock(&bc->lock);
664
665         if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
666                 goto err;
667
668         mutex_lock(&bc->lock);
669         bc->used++;
670 got_mem:
671         mutex_unlock(&bc->lock);
672
673         BUG_ON(btree_node_hashed(b));
674         BUG_ON(btree_node_dirty(b));
675         BUG_ON(btree_node_write_in_flight(b));
676 out:
677         b->flags                = 0;
678         b->written              = 0;
679         b->nsets                = 0;
680         b->write_type           = 0;
681         b->sib_u64s[0]          = 0;
682         b->sib_u64s[1]          = 0;
683         b->whiteout_u64s        = 0;
684         bch2_btree_keys_init(b);
685         set_btree_node_accessed(b);
686
687         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
688                                start_time);
689
690         memalloc_nofs_restore(flags);
691         return b;
692 err:
693         mutex_lock(&bc->lock);
694
695         /* Try to cannibalize another cached btree node: */
696         if (bc->alloc_lock == current) {
697                 b2 = btree_node_cannibalize(c);
698                 bch2_btree_node_hash_remove(bc, b2);
699
700                 if (b) {
701                         swap(b->data, b2->data);
702                         swap(b->aux_data, b2->aux_data);
703                         btree_node_to_freedlist(bc, b2);
704                         six_unlock_write(&b2->c.lock);
705                         six_unlock_intent(&b2->c.lock);
706                 } else {
707                         b = b2;
708                         list_del_init(&b->list);
709                 }
710
711                 mutex_unlock(&bc->lock);
712
713                 trace_and_count(c, btree_cache_cannibalize, c);
714                 goto out;
715         }
716
717         mutex_unlock(&bc->lock);
718         memalloc_nofs_restore(flags);
719         return ERR_PTR(-ENOMEM);
720 }
721
722 /* Slowpath, don't want it inlined into btree_iter_traverse() */
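/*
 * Allocate a cache node for @k, hash it, and start the read - waiting for it
 * if @sync; returns NULL if we raced with another fill for the same node, or
 * when not reading synchronously.
 */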
723 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
724                                 struct btree_trans *trans,
725                                 struct btree_path *path,
726                                 const struct bkey_i *k,
727                                 enum btree_id btree_id,
728                                 unsigned level,
729                                 enum six_lock_type lock_type,
730                                 bool sync)
731 {
732         struct btree_cache *bc = &c->btree_cache;
733         struct btree *b;
734         u32 seq;
735
736         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
737         /*
738          * Parent node must be locked, else we could read in a btree node that's
739          * been freed:
740          */
741         if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
742                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
743                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
744         }
745
746         b = bch2_btree_node_mem_alloc(c, level != 0);
747
748         if (trans && b == ERR_PTR(-ENOMEM)) {
749                 trans->memory_allocation_failure = true;
750                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
751                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
752         }
753
754         if (IS_ERR(b))
755                 return b;
756
757         bkey_copy(&b->key, k);
758         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
759                 /* raced with another fill: */
760
761                 /* mark as unhashed... */
762                 b->hash_val = 0;
763
764                 mutex_lock(&bc->lock);
765                 list_add(&b->list, &bc->freeable);
766                 mutex_unlock(&bc->lock);
767
768                 six_unlock_write(&b->c.lock);
769                 six_unlock_intent(&b->c.lock);
770                 return NULL;
771         }
772
773         set_btree_node_read_in_flight(b);
774
775         six_unlock_write(&b->c.lock);
776         seq = b->c.lock.state.seq;
777         six_unlock_intent(&b->c.lock);
778
779         /* Unlock before doing IO: */
780         if (trans && sync)
781                 bch2_trans_unlock(trans);
782
783         bch2_btree_node_read(c, b, sync);
784
785         if (!sync)
786                 return NULL;
787
788         if (trans) {
789                 int ret = bch2_trans_relock(trans) ?:
790                         bch2_btree_path_relock_intent(trans, path);
791                 if (ret) {
792                         BUG_ON(!trans->restarted);
793                         return ERR_PTR(ret);
794                 }
795         }
796
797         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
798                 if (trans)
799                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
800                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
801         }
802
803         return b;
804 }
805
806 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
807 {
808         struct printbuf buf = PRINTBUF;
809
810         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
811                 return;
812
813         prt_printf(&buf,
814                "btree node header doesn't match ptr\n"
815                "btree %s level %u\n"
816                "ptr: ",
817                bch2_btree_ids[b->c.btree_id], b->c.level);
818         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
819
820         prt_printf(&buf, "\nheader: btree %s level %llu\n"
821                "min ",
822                bch2_btree_ids[BTREE_NODE_ID(b->data)],
823                BTREE_NODE_LEVEL(b->data));
824         bch2_bpos_to_text(&buf, b->data->min_key);
825
826         prt_printf(&buf, "\nmax ");
827         bch2_bpos_to_text(&buf, b->data->max_key);
828
829         bch2_fs_inconsistent(c, "%s", buf.buf);
830         printbuf_exit(&buf);
831 }
832
833 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
834 {
835         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
836             b->c.level != BTREE_NODE_LEVEL(b->data) ||
837             bpos_cmp(b->data->max_key, b->key.k.p) ||
838             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
839              bpos_cmp(b->data->min_key,
840                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
841                 btree_bad_header(c, b);
842 }
843
844 /**
845  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
846  * in from disk if necessary.
847  *
848  * If IO is necessary, it is done with btree locks dropped, so the caller may
849  * get a transaction restart error instead of the node.
850  *
851  * The btree node will be returned locked with @lock_type.
852  */
853 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
854                                   const struct bkey_i *k, unsigned level,
855                                   enum six_lock_type lock_type,
856                                   unsigned long trace_ip)
857 {
858         struct bch_fs *c = trans->c;
859         struct btree_cache *bc = &c->btree_cache;
860         struct btree *b;
861         struct bset_tree *t;
862         int ret;
863
864         EBUG_ON(level >= BTREE_MAX_DEPTH);
865
866         b = btree_node_mem_ptr(k);
867
868         /*
869          * Check b->hash_val _before_ calling btree_node_lock() - this might not
870          * be the node we want anymore, and trying to lock the wrong node could
871  * cause an unnecessary transaction restart:
872          */
873         if (likely(c->opts.btree_node_mem_ptr_optimization &&
874                    b &&
875                    b->hash_val == btree_ptr_hash_val(k)))
876                 goto lock_node;
877 retry:
878         b = btree_cache_find(bc, k);
879         if (unlikely(!b)) {
880                 /*
881                  * We must have the parent locked to call bch2_btree_node_fill(),
882                  * else we could read in a btree node from disk that's been
883                  * freed:
884                  */
885                 b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
886                                          level, lock_type, true);
887
888                 /* We raced and found the btree node in the cache */
889                 if (!b)
890                         goto retry;
891
892                 if (IS_ERR(b))
893                         return b;
894         } else {
895 lock_node:
896                 /*
897                  * There's a potential deadlock with splits and insertions into
898                  * interior nodes we have to avoid:
899                  *
900                  * The other thread might be holding an intent lock on the node
901                  * we want, and they want to update its parent node so they're
902                  * going to upgrade their intent lock on the parent node to a
903                  * write lock.
904                  *
905                  * But if we're holding a read lock on the parent, and we're
906                  * trying to get the intent lock they're holding, we deadlock.
907                  *
908                  * So to avoid this we drop the read locks on parent nodes when
909                  * we're starting to take intent locks - and handle the race.
910                  *
911                  * The race is that they might be about to free the node we
912                  * want, and dropping our read lock on the parent node lets them
913                  * update the parent marking the node we want as freed, and then
914                  * free it:
915                  *
916                  * To guard against this, btree nodes are evicted from the cache
917                  * when they're freed - and b->hash_val is zeroed out, which we
918                  * check for after we lock the node.
919                  *
920                  * Then, bch2_btree_node_relock() on the parent will fail - because
921                  * the parent was modified, when the pointer to the node we want
922                  * was removed - and we'll bail out:
923                  */
924                 if (btree_node_read_locked(path, level + 1))
925                         btree_node_unlock(trans, path, level + 1);
926
927                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
928                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
929                         return ERR_PTR(ret);
930
931                 BUG_ON(ret);
932
933                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
934                              b->c.level != level ||
935                              race_fault())) {
936                         six_unlock_type(&b->c.lock, lock_type);
937                         if (bch2_btree_node_relock(trans, path, level + 1))
938                                 goto retry;
939
940                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
941                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
942                 }
943         }
944
945         if (unlikely(btree_node_read_in_flight(b))) {
946                 u32 seq = b->c.lock.state.seq;
947
948                 six_unlock_type(&b->c.lock, lock_type);
949                 bch2_trans_unlock(trans);
950
951                 bch2_btree_node_wait_on_read(b);
952
953                 /*
954                  * should_be_locked is not set on this path yet, so we need to
955                  * relock it specifically:
956                  */
957                 if (trans) {
958                         int ret = bch2_trans_relock(trans) ?:
959                                 bch2_btree_path_relock_intent(trans, path);
960                         if (ret) {
961                                 BUG_ON(!trans->restarted);
962                                 return ERR_PTR(ret);
963                         }
964                 }
965
966                 if (!six_relock_type(&b->c.lock, lock_type, seq))
967                         goto retry;
968         }
969
970         prefetch(b->aux_data);
971
972         for_each_bset(b, t) {
973                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
974
975                 prefetch(p + L1_CACHE_BYTES * 0);
976                 prefetch(p + L1_CACHE_BYTES * 1);
977                 prefetch(p + L1_CACHE_BYTES * 2);
978         }
979
980         /* avoid atomic set bit if it's not needed: */
981         if (!btree_node_accessed(b))
982                 set_btree_node_accessed(b);
983
984         if (unlikely(btree_node_read_error(b))) {
985                 six_unlock_type(&b->c.lock, lock_type);
986                 return ERR_PTR(-EIO);
987         }
988
989         EBUG_ON(b->c.btree_id != path->btree_id);
990         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
991         btree_check_header(c, b);
992
993         return b;
994 }
995
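/*
 * Like bch2_btree_node_get(), but without a btree_path: always takes a read
 * lock, and on allocation failure takes the cannibalize lock and retries.
 */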
996 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
997                                          const struct bkey_i *k,
998                                          enum btree_id btree_id,
999                                          unsigned level,
1000                                          bool nofill)
1001 {
1002         struct bch_fs *c = trans->c;
1003         struct btree_cache *bc = &c->btree_cache;
1004         struct btree *b;
1005         struct bset_tree *t;
1006         int ret;
1007
1008         EBUG_ON(level >= BTREE_MAX_DEPTH);
1009
1010         if (c->opts.btree_node_mem_ptr_optimization) {
1011                 b = btree_node_mem_ptr(k);
1012                 if (b)
1013                         goto lock_node;
1014         }
1015 retry:
1016         b = btree_cache_find(bc, k);
1017         if (unlikely(!b)) {
1018                 if (nofill)
1019                         goto out;
1020
1021                 b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
1022                                          level, SIX_LOCK_read, true);
1023
1024                 /* We raced and found the btree node in the cache */
1025                 if (!b)
1026                         goto retry;
1027
1028                 if (IS_ERR(b) &&
1029                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1030                         goto retry;
1031
1032                 if (IS_ERR(b))
1033                         goto out;
1034         } else {
1035 lock_node:
1036                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
1037                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1038                         return ERR_PTR(ret);
1039
1040                 BUG_ON(ret);
1041
1042                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1043                              b->c.btree_id != btree_id ||
1044                              b->c.level != level)) {
1045                         six_unlock_read(&b->c.lock);
1046                         goto retry;
1047                 }
1048         }
1049
1050         /* XXX: waiting on IO with btree locks held: */
1051         __bch2_btree_node_wait_on_read(b);
1052
1053         prefetch(b->aux_data);
1054
1055         for_each_bset(b, t) {
1056                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1057
1058                 prefetch(p + L1_CACHE_BYTES * 0);
1059                 prefetch(p + L1_CACHE_BYTES * 1);
1060                 prefetch(p + L1_CACHE_BYTES * 2);
1061         }
1062
1063         /* avoid atomic set bit if it's not needed: */
1064         if (!btree_node_accessed(b))
1065                 set_btree_node_accessed(b);
1066
1067         if (unlikely(btree_node_read_error(b))) {
1068                 six_unlock_read(&b->c.lock);
1069                 b = ERR_PTR(-EIO);
1070                 goto out;
1071         }
1072
1073         EBUG_ON(b->c.btree_id != btree_id);
1074         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1075         btree_check_header(c, b);
1076 out:
1077         bch2_btree_cache_cannibalize_unlock(c);
1078         return b;
1079 }
1080
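/*
 * Start an asynchronous read of @k's node if it isn't already cached; when
 * called with a transaction, the parent node must be locked.
 */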
1081 int bch2_btree_node_prefetch(struct bch_fs *c,
1082                              struct btree_trans *trans,
1083                              struct btree_path *path,
1084                              const struct bkey_i *k,
1085                              enum btree_id btree_id, unsigned level)
1086 {
1087         struct btree_cache *bc = &c->btree_cache;
1088         struct btree *b;
1089
1090         BUG_ON(trans && !btree_node_locked(path, level + 1));
1091         BUG_ON(level >= BTREE_MAX_DEPTH);
1092
1093         b = btree_cache_find(bc, k);
1094         if (b)
1095                 return 0;
1096
1097         b = bch2_btree_node_fill(c, trans, path, k, btree_id,
1098                                  level, SIX_LOCK_read, false);
1099         return PTR_ERR_OR_ZERO(b);
1100 }
1101
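/*
 * Evict the node for @k from the cache, if present: wait for in flight IO,
 * write it out if dirty, free its memory, and remove it from the hash table.
 */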
1102 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1103 {
1104         struct bch_fs *c = trans->c;
1105         struct btree_cache *bc = &c->btree_cache;
1106         struct btree *b;
1107
1108         b = btree_cache_find(bc, k);
1109         if (!b)
1110                 return;
1111 wait_on_io:
1112         /* not allowed to wait on io with btree locks held: */
1113
1114         /* XXX we're called from btree_gc which will be holding other btree
1115          * nodes locked
1116          */
1117         __bch2_btree_node_wait_on_read(b);
1118         __bch2_btree_node_wait_on_write(b);
1119
1120         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1121         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1122
1123         if (btree_node_dirty(b)) {
1124                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1125                 six_unlock_write(&b->c.lock);
1126                 six_unlock_intent(&b->c.lock);
1127                 goto wait_on_io;
1128         }
1129
1130         BUG_ON(btree_node_dirty(b));
1131
1132         mutex_lock(&bc->lock);
1133         btree_node_data_free(c, b);
1134         bch2_btree_node_hash_remove(bc, b);
1135         mutex_unlock(&bc->lock);
1136
1137         six_unlock_write(&b->c.lock);
1138         six_unlock_intent(&b->c.lock);
1139 }
1140
1141 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1142                              struct btree *b)
1143 {
1144         const struct bkey_format *f = &b->format;
1145         struct bset_stats stats;
1146
1147         memset(&stats, 0, sizeof(stats));
1148
1149         bch2_btree_keys_stats(b, &stats);
1150
1151         prt_printf(out, "l %u ", b->c.level);
1152         bch2_bpos_to_text(out, b->data->min_key);
1153         prt_printf(out, " - ");
1154         bch2_bpos_to_text(out, b->data->max_key);
1155         prt_printf(out, ":\n"
1156                "    ptrs: ");
1157         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1158
1159         prt_printf(out, "\n"
1160                "    format: u64s %u fields %u %u %u %u %u\n"
1161                "    unpack fn len: %u\n"
1162                "    bytes used %zu/%zu (%zu%% full)\n"
1163                "    sib u64s: %u, %u (merge threshold %u)\n"
1164                "    nr packed keys %u\n"
1165                "    nr unpacked keys %u\n"
1166                "    floats %zu\n"
1167                "    failed unpacked %zu\n",
1168                f->key_u64s,
1169                f->bits_per_field[0],
1170                f->bits_per_field[1],
1171                f->bits_per_field[2],
1172                f->bits_per_field[3],
1173                f->bits_per_field[4],
1174                b->unpack_fn_len,
1175                b->nr.live_u64s * sizeof(u64),
1176                btree_bytes(c) - sizeof(struct btree_node),
1177                b->nr.live_u64s * 100 / btree_max_u64s(c),
1178                b->sib_u64s[0],
1179                b->sib_u64s[1],
1180                c->btree_foreground_merge_threshold,
1181                b->nr.packed_keys,
1182                b->nr.unpacked_keys,
1183                stats.floats,
1184                stats.failed);
1185 }
1186
1187 void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
1188 {
1189         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1190         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1191         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1192
1193         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1194         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1195         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1196         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1197         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1198         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1199         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1200         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1201         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1202         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1203
1204 }