// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <linux/seq_buf.h>
#include <trace/events/bcachefs.h>

#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {                                             \
        if (shrinker_counter)                    \
                bc->not_freed_##counter++;       \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)    #f,
        BTREE_FLAGS()
#undef x
        NULL
};

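/*
 * Size the btree node reserve: 16 nodes as a base, 8 more if the first btree
 * root hasn't been allocated yet, and 8 more for every root that isn't a leaf:
 */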
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->c.level) * 8;

        c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
        if (b->c.lock.readers)
                list_move(&b->list, &bc->freed_pcpu);
        else
                list_move(&b->list, &bc->freed_nonpcpu);
}

static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
#ifdef __KERNEL__
        vfree(b->aux_data);
#else
        munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
        b->aux_data = NULL;

        bc->used--;

        btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, hash_val),
        .key_len        = sizeof(u64),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

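/*
 * Allocate the buffers backing a btree node: b->data holds the on disk node,
 * b->aux_data is an executable buffer (vmalloc_exec() in the kernel, an
 * anonymous PROT_EXEC mapping in userspace) for the node's auxiliary data:
 */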
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        BUG_ON(b->data || b->aux_data);

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
        if (b->aux_data == MAP_FAILED)
                b->aux_data = NULL;
#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
                return -ENOMEM;
        }

        return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
        struct btree *b;

        b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        bch2_btree_lock_init(&b->c);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_set_no_check_recursion(&b->c.lock.dep_map);
#endif
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
        return b;
}

struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = __btree_node_mem_alloc(c, GFP_KERNEL);
        if (!b)
                return NULL;

        if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
                kfree(b);
                return NULL;
        }

        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

        BUG_ON(ret);

        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        BUG_ON(b->hash_val);
        b->hash_val = btree_ptr_hash_val(&b->key);

        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->c.level      = level;
        b->c.btree_id   = id;

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add_tail(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                     const struct bkey_i *k)
{
        u64 v = btree_ptr_hash_val(k);

        return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * this version is for btree nodes that have already been freed (we're not
 * reaping a real btree node)
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);
wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_dirty(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        else if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        return -ENOMEM;
                }

                /* XXX: waiting on IO with btree cache lock held */
                bch2_btree_node_wait_on_read(b);
                bch2_btree_node_wait_on_write(b);
        }

        if (!six_trylock_intent(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
                return -ENOMEM;
        }

        if (!six_trylock_write(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
                goto out_unlock_intent;
        }

        /* recheck under lock */
        if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        goto out_unlock;
                }
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        if (btree_node_noevict(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
                goto out_unlock;
        }
        if (btree_node_write_blocked(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
                goto out_unlock;
        }
        if (btree_node_will_make_reachable(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
                goto out_unlock;
        }

        if (btree_node_dirty(b)) {
                if (!flush) {
                        BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        goto out_unlock;
                }
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent,
                                              BTREE_WRITE_cache_reclaim);
                else
                        __bch2_btree_node_write(c, b,
                                                BTREE_WRITE_cache_reclaim);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }
out:
        if (b->hash_val && !ret)
                trace_and_count(c, btree_cache_reap, c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->c.lock);
out_unlock_intent:
        six_unlock_intent(&b->c.lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
        return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true, false);
}

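/*
 * Shrinker scan callback: walk the freeable list and then the live list,
 * freeing node buffers that can be reclaimed. Nodes with the accessed bit set
 * get a second chance; if enough of the cache is dirty, also kick off writes
 * so that future scans can make progress:
 */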
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free = 0;
        unsigned long freed = 0;
        unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
        bool trigger_writes = atomic_read(&bc->dirty) + nr >=
                bc->used * 3 / 4;

        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;

        mutex_lock(&bc->lock);
        flags = memalloc_nofs_save();

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                /*
                 * Leave a few nodes on the freeable list, so that a btree split
                 * won't have to hit the system allocator:
                 */
                if (++i <= 3)
                        continue;

                touched++;

                if (touched >= nr)
                        goto out;

                if (!btree_node_reclaim(c, b, true)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
                        bc->freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (btree_node_accessed(b)) {
                        clear_btree_node_accessed(b);
                        bc->not_freed_access_bit++;
                } else if (!btree_node_reclaim(c, b, true)) {
                        freed++;
                        btree_node_data_free(c, b);
                        bc->freed++;

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);

                        if (freed == nr)
                                goto out_rotate;
                } else if (trigger_writes &&
                           btree_node_dirty(b) &&
                           !btree_node_will_make_reachable(b) &&
                           !btree_node_write_blocked(b) &&
                           six_trylock_read(&b->c.lock)) {
                        list_move(&bc->live, &b->list);
                        mutex_unlock(&bc->lock);
                        __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                        six_unlock_read(&b->c.lock);
                        if (touched >= nr)
                                goto out_nounlock;
                        mutex_lock(&bc->lock);
                        goto restart;
                }

                if (touched >= nr)
                        break;
        }
out_rotate:
        if (&t->list != &bc->live)
                list_move_tail(&bc->live, &t->list);
out:
        mutex_unlock(&bc->lock);
out_nounlock:
        ret = freed;
        memalloc_nofs_restore(flags);
        trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
        return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (bch2_btree_shrinker_disabled)
                return 0;

        return btree_cache_can_free(bc);
}

static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        char *cbuf;
        size_t buflen = seq_buf_get_buf(s, &cbuf);
        struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);

        bch2_btree_cache_to_text(&out, &c->btree_cache);
        seq_buf_commit(s, out.pos);
}

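/*
 * Filesystem shutdown: unregister the shrinker, complete outstanding write
 * state for dirty nodes, free every cached node's memory, and tear down the
 * hash table:
 */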
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i, flags;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty_acct(c, b);

                btree_node_data_free(c, b);
        }

        BUG_ON(atomic_read(&c->btree_cache.dirty));

        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

        while (!list_empty(&bc->freed_nonpcpu)) {
                b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

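/*
 * Filesystem startup: initialize the hash table, preallocate the btree node
 * reserve onto the freeable list, and register the shrinker:
 */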
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

        mutex_init(&c->verify_lock);

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
        ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed_pcpu);
        INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 * the root of the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_and_count(c, btree_cache_cannibalize_unlock, c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
        return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
        trace_and_count(c, btree_cache_cannibalize_lock, c);
        return 0;
}

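/*
 * Steal a cached node from the live list: first try to reclaim one without
 * flushing; if that fails, retry allowing dirty nodes to be written out,
 * looping until one succeeds:
 */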
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b, false))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

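/*
 * Allocate a btree node for the in memory cache, in order of preference:
 * reuse a struct btree off the freed list, reuse the buffers of a freeable
 * node, allocate new memory, or - if this thread holds the cannibalize lock -
 * steal another cached node. Returns the node intent and write locked:
 */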
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct list_head *freed = pcpu_read_locks
                ? &bc->freed_pcpu
                : &bc->freed_nonpcpu;
        struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;

        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, freed, list)
                if (!btree_node_reclaim(c, b, false)) {
                        list_del_init(&b->list);
                        goto got_node;
                }

        b = __btree_node_mem_alloc(c, __GFP_NOWARN);
        if (!b) {
                mutex_unlock(&bc->lock);
                b = __btree_node_mem_alloc(c, GFP_KERNEL);
                if (!b)
                        goto err;
                mutex_lock(&bc->lock);
        }

        if (pcpu_read_locks)
                six_lock_pcpu_alloc(&b->c.lock);

        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b2, &bc->freeable, list)
                if (!btree_node_reclaim(c, b2, false)) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                        goto got_mem;
                }

        mutex_unlock(&bc->lock);

        if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
                goto err;

        mutex_lock(&bc->lock);
        bc->used++;
got_mem:
        mutex_unlock(&bc->lock);

        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_write_in_flight(b));
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        bch2_btree_keys_init(b);
        set_btree_node_accessed(b);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        memalloc_nofs_restore(flags);
        return b;
err:
        mutex_lock(&bc->lock);

        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b2 = btree_node_cannibalize(c);
                bch2_btree_node_hash_remove(bc, b2);

                if (b) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                } else {
                        b = b2;
                        list_del_init(&b->list);
                }

                mutex_unlock(&bc->lock);

                trace_and_count(c, btree_cache_cannibalize, c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
                                struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u32 seq;

        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
                trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
        }

        b = bch2_btree_node_mem_alloc(trans, level != 0);

        if (b == ERR_PTR(-ENOMEM)) {
                trans->memory_allocation_failure = true;
                trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
        }

        if (IS_ERR(b))
                return b;

        /*
         * Btree nodes read in from disk should not have the accessed bit set
         * initially, so that linear scans don't thrash the cache:
         */
        clear_btree_node_accessed(b);

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                b->hash_val = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        set_btree_node_read_in_flight(b);

        six_unlock_write(&b->c.lock);
        seq = b->c.lock.state.seq;
        six_unlock_intent(&b->c.lock);

        /* Unlock before doing IO: */
        if (trans && sync)
                bch2_trans_unlock(trans);

        bch2_btree_node_read(c, b, sync);

        if (!sync)
                return NULL;

        if (path) {
                int ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(ret);
                }
        }

        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
                if (path)
                        trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }

        return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
        struct printbuf buf = PRINTBUF;

        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;

        prt_printf(&buf,
               "btree node header doesn't match ptr\n"
               "btree %s level %u\n"
               "ptr: ",
               bch2_btree_ids[b->c.btree_id], b->c.level);
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

        prt_printf(&buf, "\nheader: btree %s level %llu\n"
               "min ",
               bch2_btree_ids[BTREE_NODE_ID(b->data)],
               BTREE_NODE_LEVEL(b->data));
        bch2_bpos_to_text(&buf, b->data->min_key);

        prt_printf(&buf, "\nmax ");
        bch2_bpos_to_text(&buf, b->data->max_key);

        bch2_fs_inconsistent(c, "%s", buf.buf);
        printbuf_exit(&buf);
}

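/* Check that the node's header matches the key it was looked up by: */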
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
        if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
            b->c.level != BTREE_NODE_LEVEL(b->data) ||
            !bpos_eq(b->data->max_key, b->key.k.p) ||
            (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
             !bpos_eq(b->data->min_key,
                      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
                btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                           const struct bkey_i *k, unsigned level,
                                           enum six_lock_type lock_type,
                                           unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(trans, path, k, path->btree_id,
                                         level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
                if (btree_node_read_locked(path, level + 1))
                        btree_node_unlock(trans, path, level + 1);

                ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
                        if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;

                        trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                        return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
                }

                /* avoid atomic set bit if it's not needed: */
                if (!btree_node_accessed(b))
                        set_btree_node_accessed(b);
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        goto retry;
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * The btree node is returned locked, with the lock type given by @lock_type.
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);

        /*
         * Check b->hash_val _before_ calling btree_node_lock() - this might not
         * be the node we want anymore, and trying to lock the wrong node could
         * cause an unnecessary transaction restart:
         */
        if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
                     !b ||
                     b->hash_val != btree_ptr_hash_val(k)))
                return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(trans, path, level + 1);

        ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ERR_PTR(ret);

        BUG_ON(ret);

        if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                     b->c.level != level ||
                     race_fault())) {
                six_unlock_type(&b->c.lock, lock_type);
                if (bch2_btree_node_relock(trans, path, level + 1))
                        return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

                trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

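/*
 * Variant of bch2_btree_node_get() for use without a btree_path: takes a read
 * lock directly, and (unless @nofill) reads the node in from disk itself:
 */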
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        if (c->opts.btree_node_mem_ptr_optimization) {
                b = btree_node_mem_ptr(k);
                if (b)
                        goto lock_node;
        }
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                if (nofill)
                        goto out;

                b = bch2_btree_node_fill(trans, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b) &&
                    !bch2_btree_cache_cannibalize_lock(c, NULL))
                        goto retry;

                if (IS_ERR(b))
                        goto out;
        } else {
lock_node:
                ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
                             b->c.level != level)) {
                        six_unlock_read(&b->c.lock);
                        goto retry;
                }
        }

        /* XXX: waiting on IO with btree locks held: */
        __bch2_btree_node_wait_on_read(b);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
                b = ERR_PTR(-EIO);
                goto out;
        }

        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
}

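/*
 * Start an asynchronous read of a btree node into the cache, without waiting
 * for it to complete; a no-op if the node is already cached:
 */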
int bch2_btree_node_prefetch(struct btree_trans *trans,
                             struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_cache_find(bc, k);
        if (b)
                return 0;

        b = bch2_btree_node_fill(trans, path, k, btree_id,
                                 level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
}

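/*
 * Drop a node from the in memory cache: wait for in-flight IO, write it out
 * if dirty, then free its memory and remove it from the hash table:
 */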
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = btree_cache_find(bc, k);
        if (!b)
                return;
wait_on_io:
        /* not allowed to wait on io with btree locks held: */

        /* XXX we're called from btree_gc which will be holding other btree
         * nodes locked
         */
        __bch2_btree_node_wait_on_read(b);
        __bch2_btree_node_wait_on_write(b);

        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

        if (btree_node_dirty(b)) {
                __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        BUG_ON(btree_node_dirty(b));

        mutex_lock(&bc->lock);
        btree_node_data_free(c, b);
        bch2_btree_node_hash_remove(bc, b);
        mutex_unlock(&bc->lock);

        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             const struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        prt_printf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
        prt_printf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

        prt_printf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
}

void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
{
        prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
        prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
        prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);

        prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
        prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
        prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
        prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
        prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
        prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
        prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
        prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
        prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
        prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
}