libbcachefs/btree_cache.c
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {                                             \
        if (shrinker_counter)                    \
                bc->not_freed_##counter++;       \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)    #f,
        BTREE_FLAGS()
#undef x
        NULL
};

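/*
 * Recalculate how many cached btree nodes to hold in reserve: a base of 16,
 * plus 8 while the first btree root hasn't been set up yet, plus 8 for every
 * btree root that has interior nodes (level > 0).
 */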
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->c.level) * 8;

        c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
        if (b->c.lock.readers)
                list_move(&b->list, &bc->freed_pcpu);
        else
                list_move(&b->list, &bc->freed_nonpcpu);
}

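/*
 * Free a node's data and aux_data buffers (but not struct btree itself) and
 * move it to the appropriate freed list; called with bc->lock held.
 */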
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
#ifdef __KERNEL__
        vfree(b->aux_data);
#else
        munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
        b->aux_data = NULL;

        bc->used--;

        btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, hash_val),
        .key_len        = sizeof(u64),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

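/*
 * Allocate the data and aux_data buffers for a node; on failure, anything
 * allocated so far is freed and -ENOMEM is returned.
 */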
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        BUG_ON(b->data || b->aux_data);

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
        if (b->aux_data == MAP_FAILED)
                b->aux_data = NULL;
#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
                return -ENOMEM;
        }

        return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
        struct btree *b;

        b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_set_no_check_recursion(&b->c.lock.dep_map);
#endif
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
        return b;
}

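/*
 * Allocate a struct btree plus its buffers and put it on the freeable list;
 * used e.g. to fill out the btree cache reserve at startup.
 */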
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = __btree_node_mem_alloc(c, GFP_KERNEL);
        if (!b)
                return NULL;

        if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
                kfree(b);
                return NULL;
        }

        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

        BUG_ON(ret);

        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        BUG_ON(b->hash_val);
        b->hash_val = btree_ptr_hash_val(&b->key);

        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->c.level      = level;
        b->c.btree_id   = id;

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add_tail(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                     const struct bkey_i *k)
{
        u64 v = btree_ptr_hash_val(k);

        return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);
wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_dirty(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        else if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        return -ENOMEM;
                }

                /* XXX: waiting on IO with btree cache lock held */
                bch2_btree_node_wait_on_read(b);
                bch2_btree_node_wait_on_write(b);
        }

        if (!six_trylock_intent(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
                return -ENOMEM;
        }

        if (!six_trylock_write(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
                goto out_unlock_intent;
        }

        /* recheck under lock */
        if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        goto out_unlock;
                }
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        if (btree_node_noevict(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
                goto out_unlock;
        }
        if (btree_node_write_blocked(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
                goto out_unlock;
        }
        if (btree_node_will_make_reachable(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
                goto out_unlock;
        }

        if (btree_node_dirty(b)) {
                if (!flush) {
                        BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        goto out_unlock;
                }
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent,
                                              BTREE_WRITE_cache_reclaim);
                else
                        __bch2_btree_node_write(c, b,
                                                BTREE_WRITE_cache_reclaim);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }
out:
        if (b->hash_val && !ret)
                trace_and_count(c, btree_cache_reap, c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->c.lock);
out_unlock_intent:
        six_unlock_intent(&b->c.lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
        return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true, false);
}

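/*
 * Shrinker scan callback: walk the freeable and live lists, freeing the
 * buffers of nodes that can be reclaimed, and optionally kicking off writes
 * of dirty nodes when most of the cache is dirty.
 */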
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free = 0;
        unsigned long freed = 0;
        unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
        bool trigger_writes = atomic_read(&bc->dirty) + nr >=
                bc->used * 3 / 4;

        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;

        mutex_lock(&bc->lock);
        flags = memalloc_nofs_save();

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                /*
                 * Leave a few nodes on the freeable list, so that a btree split
                 * won't have to hit the system allocator:
                 */
                if (++i <= 3)
                        continue;

                touched++;

                if (touched >= nr)
                        goto out;

                if (!btree_node_reclaim(c, b, true)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
                        bc->freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (btree_node_accessed(b)) {
                        clear_btree_node_accessed(b);
                        bc->not_freed_access_bit++;
                } else if (!btree_node_reclaim(c, b, true)) {
                        freed++;
                        btree_node_data_free(c, b);
                        bc->freed++;

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);

                        if (freed == nr)
                                goto out_rotate;
                } else if (trigger_writes &&
                           btree_node_dirty(b) &&
                           !btree_node_will_make_reachable(b) &&
                           !btree_node_write_blocked(b) &&
                           six_trylock_read(&b->c.lock)) {
                        list_move(&bc->live, &b->list);
                        mutex_unlock(&bc->lock);
                        __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                        six_unlock_read(&b->c.lock);
                        if (touched >= nr)
                                goto out_nounlock;
                        mutex_lock(&bc->lock);
                        goto restart;
                }

                if (touched >= nr)
                        break;
        }
out_rotate:
        if (&t->list != &bc->live)
                list_move_tail(&bc->live, &t->list);
out:
        mutex_unlock(&bc->lock);
out_nounlock:
        ret = freed;
        memalloc_nofs_restore(flags);
        trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
        return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (bch2_btree_shrinker_disabled)
                return 0;

        return btree_cache_can_free(bc);
}

static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);

        bch2_btree_cache_to_text(out, &c->btree_cache);
}

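/*
 * Tear down the btree cache at filesystem shutdown: unregister the shrinker,
 * then free every remaining node.
 */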
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i, flags;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty_acct(c, b);

                btree_node_data_free(c, b);
        }

        BUG_ON(atomic_read(&c->btree_cache.dirty));

        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

        while (!list_empty(&bc->freed_nonpcpu)) {
                b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

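/*
 * Set up the btree cache: initialize the hash table, preallocate the reserve
 * of btree nodes, and register the shrinker.
 */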
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

        mutex_init(&c->verify_lock);

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
        ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed_pcpu);
        INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_and_count(c, btree_cache_cannibalize_unlock, c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

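/*
 * Take the cannibalize lock: returns 0 on success, -ENOMEM if the lock is
 * held by another thread and no closure was passed to wait with, or -EAGAIN
 * after putting @cl on the wait list.
 */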
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
        return -EAGAIN;

success:
        trace_and_count(c, btree_cache_cannibalize_lock, c);
        return 0;
}

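/*
 * Steal memory from another cached node: first try nodes that can be
 * reclaimed without flushing, then fall back to writing out dirty nodes.
 */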
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b, false))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

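/*
 * Allocate a btree node, in order of preference: reuse a struct btree from
 * the freed list, allocate a new one, take buffers from the freeable list,
 * and as a last resort cannibalize another cached node (if this thread holds
 * the cannibalize lock).
 */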
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
        struct btree_cache *bc = &c->btree_cache;
        struct list_head *freed = pcpu_read_locks
                ? &bc->freed_pcpu
                : &bc->freed_nonpcpu;
        struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;

        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, freed, list)
                if (!btree_node_reclaim(c, b, false)) {
                        list_del_init(&b->list);
                        goto got_node;
                }

        b = __btree_node_mem_alloc(c, __GFP_NOWARN);
        if (!b) {
                mutex_unlock(&bc->lock);
                b = __btree_node_mem_alloc(c, GFP_KERNEL);
                if (!b)
                        goto err;
                mutex_lock(&bc->lock);
        }

        if (pcpu_read_locks)
                six_lock_pcpu_alloc(&b->c.lock);

        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b2, &bc->freeable, list)
                if (!btree_node_reclaim(c, b2, false)) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                        goto got_mem;
                }

        mutex_unlock(&bc->lock);

        if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
                goto err;

        mutex_lock(&bc->lock);
        bc->used++;
got_mem:
        mutex_unlock(&bc->lock);

        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_write_in_flight(b));
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        bch2_btree_keys_init(b);
        set_btree_node_accessed(b);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        memalloc_nofs_restore(flags);
        return b;
err:
        mutex_lock(&bc->lock);

        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b2 = btree_node_cannibalize(c);
                bch2_btree_node_hash_remove(bc, b2);

                if (b) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                } else {
                        b = b2;
                        list_del_init(&b->list);
                }

                mutex_unlock(&bc->lock);

                trace_and_count(c, btree_cache_cannibalize, c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_trans *trans,
                                struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u32 seq;

        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
                trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
        }

        b = bch2_btree_node_mem_alloc(c, level != 0);

        if (trans && b == ERR_PTR(-ENOMEM)) {
                trans->memory_allocation_failure = true;
                trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
        }

        if (IS_ERR(b))
                return b;

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                b->hash_val = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        set_btree_node_read_in_flight(b);

        six_unlock_write(&b->c.lock);
        seq = b->c.lock.state.seq;
        six_unlock_intent(&b->c.lock);

        /* Unlock before doing IO: */
        if (trans && sync)
                bch2_trans_unlock(trans);

        bch2_btree_node_read(c, b, sync);

        if (!sync)
                return NULL;

        if (trans) {
                int ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(ret);
                }
        }

        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
                if (trans)
                        trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }

        return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
        struct printbuf buf = PRINTBUF;

        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;

        prt_printf(&buf,
               "btree node header doesn't match ptr\n"
               "btree %s level %u\n"
               "ptr: ",
               bch2_btree_ids[b->c.btree_id], b->c.level);
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

        prt_printf(&buf, "\nheader: btree %s level %llu\n"
               "min ",
               bch2_btree_ids[BTREE_NODE_ID(b->data)],
               BTREE_NODE_LEVEL(b->data));
        bch2_bpos_to_text(&buf, b->data->min_key);

        prt_printf(&buf, "\nmax ");
        bch2_bpos_to_text(&buf, b->data->max_key);

        bch2_fs_inconsistent(c, "%s", buf.buf);
        printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
        if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
            b->c.level != BTREE_NODE_LEVEL(b->data) ||
            bpos_cmp(b->data->max_key, b->key.k.p) ||
            (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
             bpos_cmp(b->data->min_key,
                      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
                btree_bad_header(c, b);
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * Returns an ERR_PTR() on failure, including transaction restart errors.
 *
 * The btree node will be locked with the lock type given by @lock_type.
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);

        /*
         * Check b->hash_val _before_ calling btree_node_lock() - this might not
         * be the node we want anymore, and trying to lock the wrong node could
         * cause an unnecessary transaction restart:
         */
        if (likely(c->opts.btree_node_mem_ptr_optimization &&
                   b &&
                   b->hash_val == btree_ptr_hash_val(k)))
                goto lock_node;
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
                                         level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
lock_node:
                /*
                 * There's a potential deadlock with splits and insertions into
                 * interior nodes we have to avoid:
                 *
                 * The other thread might be holding an intent lock on the node
                 * we want, and they want to update its parent node so they're
                 * going to upgrade their intent lock on the parent node to a
                 * write lock.
                 *
                 * But if we're holding a read lock on the parent, and we're
                 * trying to get the intent lock they're holding, we deadlock.
                 *
                 * So to avoid this we drop the read locks on parent nodes when
                 * we're starting to take intent locks - and handle the race.
                 *
                 * The race is that they might be about to free the node we
                 * want, and dropping our read lock on the parent node lets them
                 * update the parent marking the node we want as freed, and then
                 * free it:
                 *
                 * To guard against this, btree nodes are evicted from the cache
                 * when they're freed - and b->hash_val is zeroed out, which we
                 * check for after we lock the node.
                 *
                 * Then, bch2_btree_node_relock() on the parent will fail - because
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
                if (btree_node_read_locked(path, level + 1))
                        btree_node_unlock(trans, path, level + 1);

                ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
                        if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;

                        trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                        return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
                }
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        goto retry;
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

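/*
 * Variant of bch2_btree_node_get() that doesn't require a btree_path: takes
 * a read lock on the node, filling it in from disk if necessary (unless
 * @nofill is set).
 */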
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        if (c->opts.btree_node_mem_ptr_optimization) {
                b = btree_node_mem_ptr(k);
                if (b)
                        goto lock_node;
        }
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                if (nofill)
                        goto out;

                b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b) &&
                    !bch2_btree_cache_cannibalize_lock(c, NULL))
                        goto retry;

                if (IS_ERR(b))
                        goto out;
        } else {
lock_node:
                ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
                             b->c.level != level)) {
                        six_unlock_read(&b->c.lock);
                        goto retry;
                }
        }

        /* XXX: waiting on IO with btree locks held: */
        __bch2_btree_node_wait_on_read(b);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
                b = ERR_PTR(-EIO);
                goto out;
        }

        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
}

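/*
 * Start an asynchronous read of a btree node into the cache, if it isn't
 * already there.
 */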
int bch2_btree_node_prefetch(struct bch_fs *c,
                             struct btree_trans *trans,
                             struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_cache_find(bc, k);
        if (b)
                return 0;

        b = bch2_btree_node_fill(c, trans, path, k, btree_id,
                                 level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
}

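/*
 * Drop a node from the cache entirely: wait for in-flight IO, write it out
 * if dirty, then free its memory and remove it from the hash table.
 */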
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = btree_cache_find(bc, k);
        if (!b)
                return;
wait_on_io:
        /* not allowed to wait on io with btree locks held: */

        /* XXX we're called from btree_gc which will be holding other btree
         * nodes locked
         */
        __bch2_btree_node_wait_on_read(b);
        __bch2_btree_node_wait_on_write(b);

        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

        if (btree_node_dirty(b)) {
                __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        BUG_ON(btree_node_dirty(b));

        mutex_lock(&bc->lock);
        btree_node_data_free(c, b);
        bch2_btree_node_hash_remove(bc, b);
        mutex_unlock(&bc->lock);

        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        prt_printf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
        prt_printf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

        prt_printf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
{
        prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
        prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
        prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);

        prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
        prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
        prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
        prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
        prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
        prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
        prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
        prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
        prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
        prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
}