]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/btree_cache.c
Update bcachefs sources to 1b14994029 bcachefs: Fragmentation LRU
[bcachefs-tools-debian] / libbcachefs / btree_cache.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12
13 #include <linux/prefetch.h>
14 #include <linux/sched/mm.h>
15 #include <linux/seq_buf.h>
16 #include <trace/events/bcachefs.h>
17
18 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
19 do {                                             \
20         if (shrinker_counter)                    \
21                 bc->not_freed_##counter++;       \
22 } while (0)
23
24 const char * const bch2_btree_node_flags[] = {
25 #define x(f)    #f,
26         BTREE_FLAGS()
27 #undef x
28         NULL
29 };
30
31 void bch2_recalc_btree_reserve(struct bch_fs *c)
32 {
33         unsigned i, reserve = 16;
34
35         if (!c->btree_roots[0].b)
36                 reserve += 8;
37
38         for (i = 0; i < BTREE_ID_NR; i++)
39                 if (c->btree_roots[i].b)
40                         reserve += min_t(unsigned, 1,
41                                          c->btree_roots[i].b->c.level) * 8;
42
43         c->btree_cache.reserve = reserve;
44 }
45
46 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
47 {
48         return max_t(int, 0, bc->used - bc->reserve);
49 }
50
51 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
52 {
53         if (b->c.lock.readers)
54                 list_move(&b->list, &bc->freed_pcpu);
55         else
56                 list_move(&b->list, &bc->freed_nonpcpu);
57 }
58
59 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
60 {
61         struct btree_cache *bc = &c->btree_cache;
62
63         EBUG_ON(btree_node_write_in_flight(b));
64
65         kvpfree(b->data, btree_bytes(c));
66         b->data = NULL;
67 #ifdef __KERNEL__
68         vfree(b->aux_data);
69 #else
70         munmap(b->aux_data, btree_aux_data_bytes(b));
71 #endif
72         b->aux_data = NULL;
73
74         bc->used--;
75
76         btree_node_to_freedlist(bc, b);
77 }
78
79 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
80                                    const void *obj)
81 {
82         const struct btree *b = obj;
83         const u64 *v = arg->key;
84
85         return b->hash_val == *v ? 0 : 1;
86 }
87
88 static const struct rhashtable_params bch_btree_cache_params = {
89         .head_offset    = offsetof(struct btree, hash),
90         .key_offset     = offsetof(struct btree, hash_val),
91         .key_len        = sizeof(u64),
92         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
93 };
94
95 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
96 {
97         BUG_ON(b->data || b->aux_data);
98
99         b->data = kvpmalloc(btree_bytes(c), gfp);
100         if (!b->data)
101                 return -ENOMEM;
102 #ifdef __KERNEL__
103         b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
104 #else
105         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
106                            PROT_READ|PROT_WRITE|PROT_EXEC,
107                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
108         if (b->aux_data == MAP_FAILED)
109                 b->aux_data = NULL;
110 #endif
111         if (!b->aux_data) {
112                 kvpfree(b->data, btree_bytes(c));
113                 b->data = NULL;
114                 return -ENOMEM;
115         }
116
117         return 0;
118 }
119
120 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
121 {
122         struct btree *b;
123
124         b = kzalloc(sizeof(struct btree), gfp);
125         if (!b)
126                 return NULL;
127
128         bkey_btree_ptr_init(&b->key);
129         __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
130 #ifdef CONFIG_DEBUG_LOCK_ALLOC
131         lockdep_set_no_check_recursion(&b->c.lock.dep_map);
132 #endif
133         INIT_LIST_HEAD(&b->list);
134         INIT_LIST_HEAD(&b->write_blocked);
135         b->byte_order = ilog2(btree_bytes(c));
136         return b;
137 }
138
139 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
140 {
141         struct btree_cache *bc = &c->btree_cache;
142         struct btree *b;
143
144         b = __btree_node_mem_alloc(c, GFP_KERNEL);
145         if (!b)
146                 return NULL;
147
148         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
149                 kfree(b);
150                 return NULL;
151         }
152
153         bc->used++;
154         list_add(&b->list, &bc->freeable);
155         return b;
156 }
157
158 /* Btree in memory cache - hash table */
159
160 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
161 {
162         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
163
164         BUG_ON(ret);
165
166         /* Cause future lookups for this node to fail: */
167         b->hash_val = 0;
168 }
169
170 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
171 {
172         BUG_ON(b->hash_val);
173         b->hash_val = btree_ptr_hash_val(&b->key);
174
175         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
176                                              bch_btree_cache_params);
177 }
178
179 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
180                                 unsigned level, enum btree_id id)
181 {
182         int ret;
183
184         b->c.level      = level;
185         b->c.btree_id   = id;
186
187         mutex_lock(&bc->lock);
188         ret = __bch2_btree_node_hash_insert(bc, b);
189         if (!ret)
190                 list_add_tail(&b->list, &bc->live);
191         mutex_unlock(&bc->lock);
192
193         return ret;
194 }
195
196 __flatten
197 static inline struct btree *btree_cache_find(struct btree_cache *bc,
198                                      const struct bkey_i *k)
199 {
200         u64 v = btree_ptr_hash_val(k);
201
202         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
203 }
204
205 /*
206  * this version is for btree nodes that have already been freed (we're not
207  * reaping a real btree node)
208  */
209 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
210 {
211         struct btree_cache *bc = &c->btree_cache;
212         int ret = 0;
213
214         lockdep_assert_held(&bc->lock);
215 wait_on_io:
216         if (b->flags & ((1U << BTREE_NODE_dirty)|
217                         (1U << BTREE_NODE_read_in_flight)|
218                         (1U << BTREE_NODE_write_in_flight))) {
219                 if (!flush) {
220                         if (btree_node_dirty(b))
221                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
222                         else if (btree_node_read_in_flight(b))
223                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
224                         else if (btree_node_write_in_flight(b))
225                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
226                         return -ENOMEM;
227                 }
228
229                 /* XXX: waiting on IO with btree cache lock held */
230                 bch2_btree_node_wait_on_read(b);
231                 bch2_btree_node_wait_on_write(b);
232         }
233
234         if (!six_trylock_intent(&b->c.lock)) {
235                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
236                 return -ENOMEM;
237         }
238
239         if (!six_trylock_write(&b->c.lock)) {
240                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
241                 goto out_unlock_intent;
242         }
243
244         /* recheck under lock */
245         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
246                         (1U << BTREE_NODE_write_in_flight))) {
247                 if (!flush) {
248                         if (btree_node_read_in_flight(b))
249                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
250                         else if (btree_node_write_in_flight(b))
251                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
252                         goto out_unlock;
253                 }
254                 six_unlock_write(&b->c.lock);
255                 six_unlock_intent(&b->c.lock);
256                 goto wait_on_io;
257         }
258
259         if (btree_node_noevict(b)) {
260                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
261                 goto out_unlock;
262         }
263         if (btree_node_write_blocked(b)) {
264                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
265                 goto out_unlock;
266         }
267         if (btree_node_will_make_reachable(b)) {
268                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
269                 goto out_unlock;
270         }
271
272         if (btree_node_dirty(b)) {
273                 if (!flush) {
274                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
275                         goto out_unlock;
276                 }
277                 /*
278                  * Using the underscore version because we don't want to compact
279                  * bsets after the write, since this node is about to be evicted
280                  * - unless btree verify mode is enabled, since it runs out of
281                  * the post write cleanup:
282                  */
283                 if (bch2_verify_btree_ondisk)
284                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
285                                               BTREE_WRITE_cache_reclaim);
286                 else
287                         __bch2_btree_node_write(c, b,
288                                                 BTREE_WRITE_cache_reclaim);
289
290                 six_unlock_write(&b->c.lock);
291                 six_unlock_intent(&b->c.lock);
292                 goto wait_on_io;
293         }
294 out:
295         if (b->hash_val && !ret)
296                 trace_and_count(c, btree_cache_reap, c, b);
297         return ret;
298 out_unlock:
299         six_unlock_write(&b->c.lock);
300 out_unlock_intent:
301         six_unlock_intent(&b->c.lock);
302         ret = -ENOMEM;
303         goto out;
304 }
305
306 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
307 {
308         return __btree_node_reclaim(c, b, false, shrinker_counter);
309 }
310
311 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
312 {
313         return __btree_node_reclaim(c, b, true, false);
314 }
315
316 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
317                                            struct shrink_control *sc)
318 {
319         struct bch_fs *c = container_of(shrink, struct bch_fs,
320                                         btree_cache.shrink);
321         struct btree_cache *bc = &c->btree_cache;
322         struct btree *b, *t;
323         unsigned long nr = sc->nr_to_scan;
324         unsigned long can_free = 0;
325         unsigned long freed = 0;
326         unsigned long touched = 0;
327         unsigned i, flags;
328         unsigned long ret = SHRINK_STOP;
329         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
330                 bc->used * 3 / 4;
331
332         if (bch2_btree_shrinker_disabled)
333                 return SHRINK_STOP;
334
335         mutex_lock(&bc->lock);
336         flags = memalloc_nofs_save();
337
338         /*
339          * It's _really_ critical that we don't free too many btree nodes - we
340          * have to always leave ourselves a reserve. The reserve is how we
341          * guarantee that allocating memory for a new btree node can always
342          * succeed, so that inserting keys into the btree can always succeed and
343          * IO can always make forward progress:
344          */
345         can_free = btree_cache_can_free(bc);
346         nr = min_t(unsigned long, nr, can_free);
347
348         i = 0;
349         list_for_each_entry_safe(b, t, &bc->freeable, list) {
350                 /*
351                  * Leave a few nodes on the freeable list, so that a btree split
352                  * won't have to hit the system allocator:
353                  */
354                 if (++i <= 3)
355                         continue;
356
357                 touched++;
358
359                 if (touched >= nr)
360                         goto out;
361
362                 if (!btree_node_reclaim(c, b, true)) {
363                         btree_node_data_free(c, b);
364                         six_unlock_write(&b->c.lock);
365                         six_unlock_intent(&b->c.lock);
366                         freed++;
367                         bc->freed++;
368                 }
369         }
370 restart:
371         list_for_each_entry_safe(b, t, &bc->live, list) {
372                 touched++;
373
374                 if (btree_node_accessed(b)) {
375                         clear_btree_node_accessed(b);
376                         bc->not_freed_access_bit++;
377                 } else if (!btree_node_reclaim(c, b, true)) {
378                         freed++;
379                         btree_node_data_free(c, b);
380                         bc->freed++;
381
382                         bch2_btree_node_hash_remove(bc, b);
383                         six_unlock_write(&b->c.lock);
384                         six_unlock_intent(&b->c.lock);
385
386                         if (freed == nr)
387                                 goto out_rotate;
388                 } else if (trigger_writes &&
389                            btree_node_dirty(b) &&
390                            !btree_node_will_make_reachable(b) &&
391                            !btree_node_write_blocked(b) &&
392                            six_trylock_read(&b->c.lock)) {
393                         list_move(&bc->live, &b->list);
394                         mutex_unlock(&bc->lock);
395                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
396                         six_unlock_read(&b->c.lock);
397                         if (touched >= nr)
398                                 goto out_nounlock;
399                         mutex_lock(&bc->lock);
400                         goto restart;
401                 }
402
403                 if (touched >= nr)
404                         break;
405         }
406 out_rotate:
407         if (&t->list != &bc->live)
408                 list_move_tail(&bc->live, &t->list);
409 out:
410         mutex_unlock(&bc->lock);
411 out_nounlock:
412         ret = freed;
413         memalloc_nofs_restore(flags);
414         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
415         return ret;
416 }
417
418 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
419                                             struct shrink_control *sc)
420 {
421         struct bch_fs *c = container_of(shrink, struct bch_fs,
422                                         btree_cache.shrink);
423         struct btree_cache *bc = &c->btree_cache;
424
425         if (bch2_btree_shrinker_disabled)
426                 return 0;
427
428         return btree_cache_can_free(bc);
429 }
430
431 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
432 {
433         struct bch_fs *c = container_of(shrink, struct bch_fs,
434                                         btree_cache.shrink);
435         char *cbuf;
436         size_t buflen = seq_buf_get_buf(s, &cbuf);
437         struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
438
439         bch2_btree_cache_to_text(&out, &c->btree_cache);
440         seq_buf_commit(s, out.pos);
441 }
442
443 void bch2_fs_btree_cache_exit(struct bch_fs *c)
444 {
445         struct btree_cache *bc = &c->btree_cache;
446         struct btree *b;
447         unsigned i, flags;
448
449         if (bc->shrink.list.next)
450                 unregister_shrinker(&bc->shrink);
451
452         /* vfree() can allocate memory: */
453         flags = memalloc_nofs_save();
454         mutex_lock(&bc->lock);
455
456         if (c->verify_data)
457                 list_move(&c->verify_data->list, &bc->live);
458
459         kvpfree(c->verify_ondisk, btree_bytes(c));
460
461         for (i = 0; i < BTREE_ID_NR; i++)
462                 if (c->btree_roots[i].b)
463                         list_add(&c->btree_roots[i].b->list, &bc->live);
464
465         list_splice(&bc->freeable, &bc->live);
466
467         while (!list_empty(&bc->live)) {
468                 b = list_first_entry(&bc->live, struct btree, list);
469
470                 BUG_ON(btree_node_read_in_flight(b) ||
471                        btree_node_write_in_flight(b));
472
473                 if (btree_node_dirty(b))
474                         bch2_btree_complete_write(c, b, btree_current_write(b));
475                 clear_btree_node_dirty_acct(c, b);
476
477                 btree_node_data_free(c, b);
478         }
479
480         BUG_ON(atomic_read(&c->btree_cache.dirty));
481
482         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
483
484         while (!list_empty(&bc->freed_nonpcpu)) {
485                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
486                 list_del(&b->list);
487                 six_lock_pcpu_free(&b->c.lock);
488                 kfree(b);
489         }
490
491         mutex_unlock(&bc->lock);
492         memalloc_nofs_restore(flags);
493
494         if (bc->table_init_done)
495                 rhashtable_destroy(&bc->table);
496 }
497
498 int bch2_fs_btree_cache_init(struct bch_fs *c)
499 {
500         struct btree_cache *bc = &c->btree_cache;
501         unsigned i;
502         int ret = 0;
503
504         pr_verbose_init(c->opts, "");
505
506         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
507         if (ret)
508                 goto out;
509
510         bc->table_init_done = true;
511
512         bch2_recalc_btree_reserve(c);
513
514         for (i = 0; i < bc->reserve; i++)
515                 if (!__bch2_btree_node_mem_alloc(c)) {
516                         ret = -ENOMEM;
517                         goto out;
518                 }
519
520         list_splice_init(&bc->live, &bc->freeable);
521
522         mutex_init(&c->verify_lock);
523
524         bc->shrink.count_objects        = bch2_btree_cache_count;
525         bc->shrink.scan_objects         = bch2_btree_cache_scan;
526         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
527         bc->shrink.seeks                = 4;
528         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
529 out:
530         pr_verbose_init(c->opts, "ret %i", ret);
531         return ret;
532 }
533
534 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
535 {
536         mutex_init(&bc->lock);
537         INIT_LIST_HEAD(&bc->live);
538         INIT_LIST_HEAD(&bc->freeable);
539         INIT_LIST_HEAD(&bc->freed_pcpu);
540         INIT_LIST_HEAD(&bc->freed_nonpcpu);
541 }
542
543 /*
544  * We can only have one thread cannibalizing other cached btree nodes at a time,
545  * or we'll deadlock. We use an open coded mutex to ensure that, which a
546  * cannibalize_bucket() will take. This means every time we unlock the root of
547  * the btree, we need to release this lock if we have it held.
548  */
549 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
550 {
551         struct btree_cache *bc = &c->btree_cache;
552
553         if (bc->alloc_lock == current) {
554                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
555                 bc->alloc_lock = NULL;
556                 closure_wake_up(&bc->alloc_wait);
557         }
558 }
559
560 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
561 {
562         struct btree_cache *bc = &c->btree_cache;
563         struct task_struct *old;
564
565         old = cmpxchg(&bc->alloc_lock, NULL, current);
566         if (old == NULL || old == current)
567                 goto success;
568
569         if (!cl) {
570                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
571                 return -ENOMEM;
572         }
573
574         closure_wait(&bc->alloc_wait, cl);
575
576         /* Try again, after adding ourselves to waitlist */
577         old = cmpxchg(&bc->alloc_lock, NULL, current);
578         if (old == NULL || old == current) {
579                 /* We raced */
580                 closure_wake_up(&bc->alloc_wait);
581                 goto success;
582         }
583
584         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
585         return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
586
587 success:
588         trace_and_count(c, btree_cache_cannibalize_lock, c);
589         return 0;
590 }
591
592 static struct btree *btree_node_cannibalize(struct bch_fs *c)
593 {
594         struct btree_cache *bc = &c->btree_cache;
595         struct btree *b;
596
597         list_for_each_entry_reverse(b, &bc->live, list)
598                 if (!btree_node_reclaim(c, b, false))
599                         return b;
600
601         while (1) {
602                 list_for_each_entry_reverse(b, &bc->live, list)
603                         if (!btree_node_write_and_reclaim(c, b))
604                                 return b;
605
606                 /*
607                  * Rare case: all nodes were intent-locked.
608                  * Just busy-wait.
609                  */
610                 WARN_ONCE(1, "btree cache cannibalize failed\n");
611                 cond_resched();
612         }
613 }
614
615 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
616 {
617         struct btree_cache *bc = &c->btree_cache;
618         struct list_head *freed = pcpu_read_locks
619                 ? &bc->freed_pcpu
620                 : &bc->freed_nonpcpu;
621         struct btree *b, *b2;
622         u64 start_time = local_clock();
623         unsigned flags;
624
625         flags = memalloc_nofs_save();
626         mutex_lock(&bc->lock);
627
628         /*
629          * We never free struct btree itself, just the memory that holds the on
630          * disk node. Check the freed list before allocating a new one:
631          */
632         list_for_each_entry(b, freed, list)
633                 if (!btree_node_reclaim(c, b, false)) {
634                         list_del_init(&b->list);
635                         goto got_node;
636                 }
637
638         b = __btree_node_mem_alloc(c, __GFP_NOWARN);
639         if (!b) {
640                 mutex_unlock(&bc->lock);
641                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
642                 if (!b)
643                         goto err;
644                 mutex_lock(&bc->lock);
645         }
646
647         if (pcpu_read_locks)
648                 six_lock_pcpu_alloc(&b->c.lock);
649
650         BUG_ON(!six_trylock_intent(&b->c.lock));
651         BUG_ON(!six_trylock_write(&b->c.lock));
652 got_node:
653
654         /*
655          * btree_free() doesn't free memory; it sticks the node on the end of
656          * the list. Check if there's any freed nodes there:
657          */
658         list_for_each_entry(b2, &bc->freeable, list)
659                 if (!btree_node_reclaim(c, b2, false)) {
660                         swap(b->data, b2->data);
661                         swap(b->aux_data, b2->aux_data);
662                         btree_node_to_freedlist(bc, b2);
663                         six_unlock_write(&b2->c.lock);
664                         six_unlock_intent(&b2->c.lock);
665                         goto got_mem;
666                 }
667
668         mutex_unlock(&bc->lock);
669
670         if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
671                 goto err;
672
673         mutex_lock(&bc->lock);
674         bc->used++;
675 got_mem:
676         mutex_unlock(&bc->lock);
677
678         BUG_ON(btree_node_hashed(b));
679         BUG_ON(btree_node_dirty(b));
680         BUG_ON(btree_node_write_in_flight(b));
681 out:
682         b->flags                = 0;
683         b->written              = 0;
684         b->nsets                = 0;
685         b->sib_u64s[0]          = 0;
686         b->sib_u64s[1]          = 0;
687         b->whiteout_u64s        = 0;
688         bch2_btree_keys_init(b);
689         set_btree_node_accessed(b);
690
691         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
692                                start_time);
693
694         memalloc_nofs_restore(flags);
695         return b;
696 err:
697         mutex_lock(&bc->lock);
698
699         /* Try to cannibalize another cached btree node: */
700         if (bc->alloc_lock == current) {
701                 b2 = btree_node_cannibalize(c);
702                 bch2_btree_node_hash_remove(bc, b2);
703
704                 if (b) {
705                         swap(b->data, b2->data);
706                         swap(b->aux_data, b2->aux_data);
707                         btree_node_to_freedlist(bc, b2);
708                         six_unlock_write(&b2->c.lock);
709                         six_unlock_intent(&b2->c.lock);
710                 } else {
711                         b = b2;
712                         list_del_init(&b->list);
713                 }
714
715                 mutex_unlock(&bc->lock);
716
717                 trace_and_count(c, btree_cache_cannibalize, c);
718                 goto out;
719         }
720
721         mutex_unlock(&bc->lock);
722         memalloc_nofs_restore(flags);
723         return ERR_PTR(-ENOMEM);
724 }
725
726 /* Slowpath, don't want it inlined into btree_iter_traverse() */
727 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
728                                 struct btree_trans *trans,
729                                 struct btree_path *path,
730                                 const struct bkey_i *k,
731                                 enum btree_id btree_id,
732                                 unsigned level,
733                                 enum six_lock_type lock_type,
734                                 bool sync)
735 {
736         struct btree_cache *bc = &c->btree_cache;
737         struct btree *b;
738         u32 seq;
739
740         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
741         /*
742          * Parent node must be locked, else we could read in a btree node that's
743          * been freed:
744          */
745         if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
746                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
747                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
748         }
749
750         b = bch2_btree_node_mem_alloc(c, level != 0);
751
752         if (trans && b == ERR_PTR(-ENOMEM)) {
753                 trans->memory_allocation_failure = true;
754                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
755                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
756         }
757
758         if (IS_ERR(b))
759                 return b;
760
761         /*
762          * Btree nodes read in from disk should not have the accessed bit set
763          * initially, so that linear scans don't thrash the cache:
764          */
765         clear_btree_node_accessed(b);
766
767         bkey_copy(&b->key, k);
768         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
769                 /* raced with another fill: */
770
771                 /* mark as unhashed... */
772                 b->hash_val = 0;
773
774                 mutex_lock(&bc->lock);
775                 list_add(&b->list, &bc->freeable);
776                 mutex_unlock(&bc->lock);
777
778                 six_unlock_write(&b->c.lock);
779                 six_unlock_intent(&b->c.lock);
780                 return NULL;
781         }
782
783         set_btree_node_read_in_flight(b);
784
785         six_unlock_write(&b->c.lock);
786         seq = b->c.lock.state.seq;
787         six_unlock_intent(&b->c.lock);
788
789         /* Unlock before doing IO: */
790         if (trans && sync)
791                 bch2_trans_unlock(trans);
792
793         bch2_btree_node_read(c, b, sync);
794
795         if (!sync)
796                 return NULL;
797
798         if (trans) {
799                 int ret = bch2_trans_relock(trans) ?:
800                         bch2_btree_path_relock_intent(trans, path);
801                 if (ret) {
802                         BUG_ON(!trans->restarted);
803                         return ERR_PTR(ret);
804                 }
805         }
806
807         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
808                 if (trans)
809                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
810                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
811         }
812
813         return b;
814 }
815
816 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
817 {
818         struct printbuf buf = PRINTBUF;
819
820         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
821                 return;
822
823         prt_printf(&buf,
824                "btree node header doesn't match ptr\n"
825                "btree %s level %u\n"
826                "ptr: ",
827                bch2_btree_ids[b->c.btree_id], b->c.level);
828         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
829
830         prt_printf(&buf, "\nheader: btree %s level %llu\n"
831                "min ",
832                bch2_btree_ids[BTREE_NODE_ID(b->data)],
833                BTREE_NODE_LEVEL(b->data));
834         bch2_bpos_to_text(&buf, b->data->min_key);
835
836         prt_printf(&buf, "\nmax ");
837         bch2_bpos_to_text(&buf, b->data->max_key);
838
839         bch2_fs_inconsistent(c, "%s", buf.buf);
840         printbuf_exit(&buf);
841 }
842
843 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
844 {
845         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
846             b->c.level != BTREE_NODE_LEVEL(b->data) ||
847             !bpos_eq(b->data->max_key, b->key.k.p) ||
848             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
849              !bpos_eq(b->data->min_key,
850                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
851                 btree_bad_header(c, b);
852 }
853
854 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
855                                            const struct bkey_i *k, unsigned level,
856                                            enum six_lock_type lock_type,
857                                            unsigned long trace_ip)
858 {
859         struct bch_fs *c = trans->c;
860         struct btree_cache *bc = &c->btree_cache;
861         struct btree *b;
862         struct bset_tree *t;
863         int ret;
864
865         EBUG_ON(level >= BTREE_MAX_DEPTH);
866 retry:
867         b = btree_cache_find(bc, k);
868         if (unlikely(!b)) {
869                 /*
870                  * We must have the parent locked to call bch2_btree_node_fill(),
871                  * else we could read in a btree node from disk that's been
872                  * freed:
873                  */
874                 b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
875                                          level, lock_type, true);
876
877                 /* We raced and found the btree node in the cache */
878                 if (!b)
879                         goto retry;
880
881                 if (IS_ERR(b))
882                         return b;
883         } else {
884                 if (btree_node_read_locked(path, level + 1))
885                         btree_node_unlock(trans, path, level + 1);
886
887                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
888                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
889                         return ERR_PTR(ret);
890
891                 BUG_ON(ret);
892
893                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
894                              b->c.level != level ||
895                              race_fault())) {
896                         six_unlock_type(&b->c.lock, lock_type);
897                         if (bch2_btree_node_relock(trans, path, level + 1))
898                                 goto retry;
899
900                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
901                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
902                 }
903
904                 /* avoid atomic set bit if it's not needed: */
905                 if (!btree_node_accessed(b))
906                         set_btree_node_accessed(b);
907         }
908
909         if (unlikely(btree_node_read_in_flight(b))) {
910                 u32 seq = b->c.lock.state.seq;
911
912                 six_unlock_type(&b->c.lock, lock_type);
913                 bch2_trans_unlock(trans);
914
915                 bch2_btree_node_wait_on_read(b);
916
917                 /*
918                  * should_be_locked is not set on this path yet, so we need to
919                  * relock it specifically:
920                  */
921                 if (trans) {
922                         int ret = bch2_trans_relock(trans) ?:
923                                 bch2_btree_path_relock_intent(trans, path);
924                         if (ret) {
925                                 BUG_ON(!trans->restarted);
926                                 return ERR_PTR(ret);
927                         }
928                 }
929
930                 if (!six_relock_type(&b->c.lock, lock_type, seq))
931                         goto retry;
932         }
933
934         prefetch(b->aux_data);
935
936         for_each_bset(b, t) {
937                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
938
939                 prefetch(p + L1_CACHE_BYTES * 0);
940                 prefetch(p + L1_CACHE_BYTES * 1);
941                 prefetch(p + L1_CACHE_BYTES * 2);
942         }
943
944         if (unlikely(btree_node_read_error(b))) {
945                 six_unlock_type(&b->c.lock, lock_type);
946                 return ERR_PTR(-EIO);
947         }
948
949         EBUG_ON(b->c.btree_id != path->btree_id);
950         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
951         btree_check_header(c, b);
952
953         return b;
954 }
955
956 /**
957  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
958  * in from disk if necessary.
959  *
960  * The btree node will have either a read or a write lock held, depending on
961  * the @write parameter.
962  */
963 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
964                                   const struct bkey_i *k, unsigned level,
965                                   enum six_lock_type lock_type,
966                                   unsigned long trace_ip)
967 {
968         struct bch_fs *c = trans->c;
969         struct btree *b;
970         struct bset_tree *t;
971         int ret;
972
973         EBUG_ON(level >= BTREE_MAX_DEPTH);
974
975         b = btree_node_mem_ptr(k);
976
977         /*
978          * Check b->hash_val _before_ calling btree_node_lock() - this might not
979          * be the node we want anymore, and trying to lock the wrong node could
980          * cause an unneccessary transaction restart:
981          */
982         if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
983                      !b ||
984                      b->hash_val != btree_ptr_hash_val(k)))
985                 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
986
987         if (btree_node_read_locked(path, level + 1))
988                 btree_node_unlock(trans, path, level + 1);
989
990         ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
991         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
992                 return ERR_PTR(ret);
993
994         BUG_ON(ret);
995
996         if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
997                      b->c.level != level ||
998                      race_fault())) {
999                 six_unlock_type(&b->c.lock, lock_type);
1000                 if (bch2_btree_node_relock(trans, path, level + 1))
1001                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1002
1003                 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1004                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1005         }
1006
1007         if (unlikely(btree_node_read_in_flight(b))) {
1008                 u32 seq = b->c.lock.state.seq;
1009
1010                 six_unlock_type(&b->c.lock, lock_type);
1011                 bch2_trans_unlock(trans);
1012
1013                 bch2_btree_node_wait_on_read(b);
1014
1015                 /*
1016                  * should_be_locked is not set on this path yet, so we need to
1017                  * relock it specifically:
1018                  */
1019                 if (trans) {
1020                         int ret = bch2_trans_relock(trans) ?:
1021                                 bch2_btree_path_relock_intent(trans, path);
1022                         if (ret) {
1023                                 BUG_ON(!trans->restarted);
1024                                 return ERR_PTR(ret);
1025                         }
1026                 }
1027
1028                 if (!six_relock_type(&b->c.lock, lock_type, seq))
1029                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1030         }
1031
1032         prefetch(b->aux_data);
1033
1034         for_each_bset(b, t) {
1035                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1036
1037                 prefetch(p + L1_CACHE_BYTES * 0);
1038                 prefetch(p + L1_CACHE_BYTES * 1);
1039                 prefetch(p + L1_CACHE_BYTES * 2);
1040         }
1041
1042         /* avoid atomic set bit if it's not needed: */
1043         if (!btree_node_accessed(b))
1044                 set_btree_node_accessed(b);
1045
1046         if (unlikely(btree_node_read_error(b))) {
1047                 six_unlock_type(&b->c.lock, lock_type);
1048                 return ERR_PTR(-EIO);
1049         }
1050
1051         EBUG_ON(b->c.btree_id != path->btree_id);
1052         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1053         btree_check_header(c, b);
1054
1055         return b;
1056 }
1057
1058 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1059                                          const struct bkey_i *k,
1060                                          enum btree_id btree_id,
1061                                          unsigned level,
1062                                          bool nofill)
1063 {
1064         struct bch_fs *c = trans->c;
1065         struct btree_cache *bc = &c->btree_cache;
1066         struct btree *b;
1067         struct bset_tree *t;
1068         int ret;
1069
1070         EBUG_ON(level >= BTREE_MAX_DEPTH);
1071
1072         if (c->opts.btree_node_mem_ptr_optimization) {
1073                 b = btree_node_mem_ptr(k);
1074                 if (b)
1075                         goto lock_node;
1076         }
1077 retry:
1078         b = btree_cache_find(bc, k);
1079         if (unlikely(!b)) {
1080                 if (nofill)
1081                         goto out;
1082
1083                 b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
1084                                          level, SIX_LOCK_read, true);
1085
1086                 /* We raced and found the btree node in the cache */
1087                 if (!b)
1088                         goto retry;
1089
1090                 if (IS_ERR(b) &&
1091                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1092                         goto retry;
1093
1094                 if (IS_ERR(b))
1095                         goto out;
1096         } else {
1097 lock_node:
1098                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1099                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1100                         return ERR_PTR(ret);
1101
1102                 BUG_ON(ret);
1103
1104                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1105                              b->c.btree_id != btree_id ||
1106                              b->c.level != level)) {
1107                         six_unlock_read(&b->c.lock);
1108                         goto retry;
1109                 }
1110         }
1111
1112         /* XXX: waiting on IO with btree locks held: */
1113         __bch2_btree_node_wait_on_read(b);
1114
1115         prefetch(b->aux_data);
1116
1117         for_each_bset(b, t) {
1118                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1119
1120                 prefetch(p + L1_CACHE_BYTES * 0);
1121                 prefetch(p + L1_CACHE_BYTES * 1);
1122                 prefetch(p + L1_CACHE_BYTES * 2);
1123         }
1124
1125         /* avoid atomic set bit if it's not needed: */
1126         if (!btree_node_accessed(b))
1127                 set_btree_node_accessed(b);
1128
1129         if (unlikely(btree_node_read_error(b))) {
1130                 six_unlock_read(&b->c.lock);
1131                 b = ERR_PTR(-EIO);
1132                 goto out;
1133         }
1134
1135         EBUG_ON(b->c.btree_id != btree_id);
1136         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1137         btree_check_header(c, b);
1138 out:
1139         bch2_btree_cache_cannibalize_unlock(c);
1140         return b;
1141 }
1142
1143 int bch2_btree_node_prefetch(struct bch_fs *c,
1144                              struct btree_trans *trans,
1145                              struct btree_path *path,
1146                              const struct bkey_i *k,
1147                              enum btree_id btree_id, unsigned level)
1148 {
1149         struct btree_cache *bc = &c->btree_cache;
1150         struct btree *b;
1151
1152         BUG_ON(trans && !btree_node_locked(path, level + 1));
1153         BUG_ON(level >= BTREE_MAX_DEPTH);
1154
1155         b = btree_cache_find(bc, k);
1156         if (b)
1157                 return 0;
1158
1159         b = bch2_btree_node_fill(c, trans, path, k, btree_id,
1160                                  level, SIX_LOCK_read, false);
1161         return PTR_ERR_OR_ZERO(b);
1162 }
1163
1164 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1165 {
1166         struct bch_fs *c = trans->c;
1167         struct btree_cache *bc = &c->btree_cache;
1168         struct btree *b;
1169
1170         b = btree_cache_find(bc, k);
1171         if (!b)
1172                 return;
1173 wait_on_io:
1174         /* not allowed to wait on io with btree locks held: */
1175
1176         /* XXX we're called from btree_gc which will be holding other btree
1177          * nodes locked
1178          */
1179         __bch2_btree_node_wait_on_read(b);
1180         __bch2_btree_node_wait_on_write(b);
1181
1182         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1183         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1184
1185         if (btree_node_dirty(b)) {
1186                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1187                 six_unlock_write(&b->c.lock);
1188                 six_unlock_intent(&b->c.lock);
1189                 goto wait_on_io;
1190         }
1191
1192         BUG_ON(btree_node_dirty(b));
1193
1194         mutex_lock(&bc->lock);
1195         btree_node_data_free(c, b);
1196         bch2_btree_node_hash_remove(bc, b);
1197         mutex_unlock(&bc->lock);
1198
1199         six_unlock_write(&b->c.lock);
1200         six_unlock_intent(&b->c.lock);
1201 }
1202
1203 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1204                              struct btree *b)
1205 {
1206         const struct bkey_format *f = &b->format;
1207         struct bset_stats stats;
1208
1209         memset(&stats, 0, sizeof(stats));
1210
1211         bch2_btree_keys_stats(b, &stats);
1212
1213         prt_printf(out, "l %u ", b->c.level);
1214         bch2_bpos_to_text(out, b->data->min_key);
1215         prt_printf(out, " - ");
1216         bch2_bpos_to_text(out, b->data->max_key);
1217         prt_printf(out, ":\n"
1218                "    ptrs: ");
1219         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1220
1221         prt_printf(out, "\n"
1222                "    format: u64s %u fields %u %u %u %u %u\n"
1223                "    unpack fn len: %u\n"
1224                "    bytes used %zu/%zu (%zu%% full)\n"
1225                "    sib u64s: %u, %u (merge threshold %u)\n"
1226                "    nr packed keys %u\n"
1227                "    nr unpacked keys %u\n"
1228                "    floats %zu\n"
1229                "    failed unpacked %zu\n",
1230                f->key_u64s,
1231                f->bits_per_field[0],
1232                f->bits_per_field[1],
1233                f->bits_per_field[2],
1234                f->bits_per_field[3],
1235                f->bits_per_field[4],
1236                b->unpack_fn_len,
1237                b->nr.live_u64s * sizeof(u64),
1238                btree_bytes(c) - sizeof(struct btree_node),
1239                b->nr.live_u64s * 100 / btree_max_u64s(c),
1240                b->sib_u64s[0],
1241                b->sib_u64s[1],
1242                c->btree_foreground_merge_threshold,
1243                b->nr.packed_keys,
1244                b->nr.unpacked_keys,
1245                stats.floats,
1246                stats.failed);
1247 }
1248
1249 void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
1250 {
1251         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1252         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1253         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1254
1255         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1256         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1257         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1258         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1259         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1260         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1261         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1262         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1263         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1264         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1265
1266 }