// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "errcode.h"
#include "error.h"

#include <linux/prefetch.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

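/*
 * Bump the per-reason "not freed" counters when the shrinker asks us to
 * account for why a node couldn't be reclaimed; these counters are reported
 * by bch2_btree_cache_to_text().
 */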
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
do {                                             \
        if (shrinker_counter)                    \
                bc->not_freed_##counter++;       \
} while (0)

const char * const bch2_btree_node_flags[] = {
#define x(f)    #f,
        BTREE_FLAGS()
#undef x
        NULL
};

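/*
 * Recompute how many cached btree nodes we keep in reserve: a base of 16,
 * plus 8 per btree with a non-leaf root (or 8 extra if no roots have been
 * set up yet).
 */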
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
        unsigned i, reserve = 16;

        if (!c->btree_roots[0].b)
                reserve += 8;

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        reserve += min_t(unsigned, 1,
                                         c->btree_roots[i].b->c.level) * 8;

        c->btree_cache.reserve = reserve;
}

static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
        return max_t(int, 0, bc->used - bc->reserve);
}

static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
        if (b->c.lock.readers)
                list_move(&b->list, &bc->freed_pcpu);
        else
                list_move(&b->list, &bc->freed_nonpcpu);
}

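/*
 * Free a node's data buffers (but not struct btree itself), drop it from the
 * cache's node count and move it to the appropriate freed list.
 */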
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
        struct btree_cache *bc = &c->btree_cache;

        EBUG_ON(btree_node_write_in_flight(b));

        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
#ifdef __KERNEL__
        vfree(b->aux_data);
#else
        munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
        b->aux_data = NULL;

        bc->used--;

        btree_node_to_freedlist(bc, b);
}

static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                   const void *obj)
{
        const struct btree *b = obj;
        const u64 *v = arg->key;

        return b->hash_val == *v ? 0 : 1;
}

static const struct rhashtable_params bch_btree_cache_params = {
        .head_offset    = offsetof(struct btree, hash),
        .key_offset     = offsetof(struct btree, hash_val),
        .key_len        = sizeof(u64),
        .obj_cmpfn      = bch2_btree_cache_cmp_fn,
};

static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
        BUG_ON(b->data || b->aux_data);

        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
#else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
        if (b->aux_data == MAP_FAILED)
                b->aux_data = NULL;
#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
                return -ENOMEM;
        }

        return 0;
}

static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
        struct btree *b;

        b = kzalloc(sizeof(struct btree), gfp);
        if (!b)
                return NULL;

        bkey_btree_ptr_init(&b->key);
        __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_set_no_check_recursion(&b->c.lock.dep_map);
#endif
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
        b->byte_order = ilog2(btree_bytes(c));
        return b;
}

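/*
 * Allocate a struct btree along with its data buffers and put it on the
 * freeable list (used e.g. to prefill the cache reserve at init time).
 */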
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = __btree_node_mem_alloc(c, GFP_KERNEL);
        if (!b)
                return NULL;

        if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
                kfree(b);
                return NULL;
        }

        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
}

/* Btree in memory cache - hash table */

void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
        int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);

        BUG_ON(ret);

        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
}

int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
        BUG_ON(b->hash_val);
        b->hash_val = btree_ptr_hash_val(&b->key);

        return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
                                             bch_btree_cache_params);
}

int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
                                unsigned level, enum btree_id id)
{
        int ret;

        b->c.level      = level;
        b->c.btree_id   = id;

        mutex_lock(&bc->lock);
        ret = __bch2_btree_node_hash_insert(bc, b);
        if (!ret)
                list_add_tail(&b->list, &bc->live);
        mutex_unlock(&bc->lock);

        return ret;
}

__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
                                     const struct bkey_i *k)
{
        u64 v = btree_ptr_hash_val(k);

        return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}

/*
 * Try to make a btree node reclaimable: if @flush is set, flush dirty data and
 * wait for in-flight IO, then attempt to take the node's locks. Returns 0 with
 * the node locked (intent + write) on success. Called with bc->lock held.
 */
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
{
        struct btree_cache *bc = &c->btree_cache;
        int ret = 0;

        lockdep_assert_held(&bc->lock);
wait_on_io:
        if (b->flags & ((1U << BTREE_NODE_dirty)|
                        (1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_dirty(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        else if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        return -ENOMEM;
                }

                /* XXX: waiting on IO with btree cache lock held */
                bch2_btree_node_wait_on_read(b);
                bch2_btree_node_wait_on_write(b);
        }

        if (!six_trylock_intent(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
                return -ENOMEM;
        }

        if (!six_trylock_write(&b->c.lock)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
                goto out_unlock_intent;
        }

        /* recheck under lock */
        if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
                        (1U << BTREE_NODE_write_in_flight))) {
                if (!flush) {
                        if (btree_node_read_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
                        else if (btree_node_write_in_flight(b))
                                BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
                        goto out_unlock;
                }
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        if (btree_node_noevict(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
                goto out_unlock;
        }
        if (btree_node_write_blocked(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
                goto out_unlock;
        }
        if (btree_node_will_make_reachable(b)) {
                BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
                goto out_unlock;
        }

        if (btree_node_dirty(b)) {
                if (!flush) {
                        BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
                        goto out_unlock;
                }
                /*
                 * Using the underscore version because we don't want to compact
                 * bsets after the write, since this node is about to be evicted
                 * - unless btree verify mode is enabled, since it runs out of
                 * the post write cleanup:
                 */
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent,
                                              BTREE_WRITE_cache_reclaim);
                else
                        __bch2_btree_node_write(c, b,
                                                BTREE_WRITE_cache_reclaim);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }
out:
        if (b->hash_val && !ret)
                trace_and_count(c, btree_cache_reap, c, b);
        return ret;
out_unlock:
        six_unlock_write(&b->c.lock);
out_unlock_intent:
        six_unlock_intent(&b->c.lock);
        ret = -ENOMEM;
        goto out;
}

static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
        return __btree_node_reclaim(c, b, false, shrinker_counter);
}

static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
        return __btree_node_reclaim(c, b, true, false);
}

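/*
 * Shrinker .scan_objects callback: walk the freeable and live lists, freeing
 * clean, unlocked nodes until sc->nr_to_scan have been touched; when dirty
 * nodes make up most of the cache, also kick off writes so future scans have
 * something to free.
 */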
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
        unsigned long can_free = 0;
        unsigned long freed = 0;
        unsigned long touched = 0;
        unsigned i, flags;
        unsigned long ret = SHRINK_STOP;
        bool trigger_writes = atomic_read(&bc->dirty) + nr >=
                bc->used * 3 / 4;

        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;

        mutex_lock(&bc->lock);
        flags = memalloc_nofs_save();

        /*
         * It's _really_ critical that we don't free too many btree nodes - we
         * have to always leave ourselves a reserve. The reserve is how we
         * guarantee that allocating memory for a new btree node can always
         * succeed, so that inserting keys into the btree can always succeed and
         * IO can always make forward progress:
         */
        can_free = btree_cache_can_free(bc);
        nr = min_t(unsigned long, nr, can_free);

        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
                /*
                 * Leave a few nodes on the freeable list, so that a btree split
                 * won't have to hit the system allocator:
                 */
                if (++i <= 3)
                        continue;

                touched++;

                if (touched >= nr)
                        goto out;

                if (!btree_node_reclaim(c, b, true)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                        freed++;
                        bc->freed++;
                }
        }
restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;

                if (btree_node_accessed(b)) {
                        clear_btree_node_accessed(b);
                        bc->not_freed_access_bit++;
                } else if (!btree_node_reclaim(c, b, true)) {
                        freed++;
                        btree_node_data_free(c, b);
                        bc->freed++;

                        bch2_btree_node_hash_remove(bc, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);

                        if (freed == nr)
                                goto out_rotate;
                } else if (trigger_writes &&
                           btree_node_dirty(b) &&
                           !btree_node_will_make_reachable(b) &&
                           !btree_node_write_blocked(b) &&
                           six_trylock_read(&b->c.lock)) {
                        list_move(&bc->live, &b->list);
                        mutex_unlock(&bc->lock);
                        __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                        six_unlock_read(&b->c.lock);
                        if (touched >= nr)
                                goto out_nounlock;
                        mutex_lock(&bc->lock);
                        goto restart;
                }

                if (touched >= nr)
                        break;
        }
out_rotate:
        if (&t->list != &bc->live)
                list_move_tail(&bc->live, &t->list);
out:
        mutex_unlock(&bc->lock);
out_nounlock:
        ret = freed;
        memalloc_nofs_restore(flags);
        trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
        return ret;
}

static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);
        struct btree_cache *bc = &c->btree_cache;

        if (bch2_btree_shrinker_disabled)
                return 0;

        return btree_cache_can_free(bc);
}

static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_cache.shrink);

        bch2_btree_cache_to_text(out, &c->btree_cache);
}

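/*
 * Tear down the btree node cache: unregister the shrinker, release any
 * remaining per-node state and free every cached node.
 */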
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        unsigned i, flags;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);

        kvpfree(c->verify_ondisk, btree_bytes(c));

        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
                        list_add(&c->btree_roots[i].b->list, &bc->live);

        list_splice(&bc->freeable, &bc->live);

        while (!list_empty(&bc->live)) {
                b = list_first_entry(&bc->live, struct btree, list);

                BUG_ON(btree_node_read_in_flight(b) ||
                       btree_node_write_in_flight(b));

                if (btree_node_dirty(b))
                        bch2_btree_complete_write(c, b, btree_current_write(b));
                clear_btree_node_dirty_acct(c, b);

                btree_node_data_free(c, b);
        }

        BUG_ON(atomic_read(&c->btree_cache.dirty));

        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

        while (!list_empty(&bc->freed_nonpcpu)) {
                b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
                six_lock_pcpu_free(&b->c.lock);
                kfree(b);
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

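/*
 * Set up the btree node cache: initialize the hash table, preallocate the
 * node reserve, and register the shrinker.
 */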
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
        int ret = 0;

        pr_verbose_init(c->opts, "");

        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
                goto out;

        bc->table_init_done = true;

        bch2_recalc_btree_reserve(c);

        for (i = 0; i < bc->reserve; i++)
                if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }

        list_splice_init(&bc->live, &bc->freeable);

        mutex_init(&c->verify_lock);

        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
        ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
}

void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
        mutex_init(&bc->lock);
        INIT_LIST_HEAD(&bc->live);
        INIT_LIST_HEAD(&bc->freeable);
        INIT_LIST_HEAD(&bc->freed_pcpu);
        INIT_LIST_HEAD(&bc->freed_nonpcpu);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which
 * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 * the root of the btree, we need to release this lock if we have it held.
 */
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;

        if (bc->alloc_lock == current) {
                trace_and_count(c, btree_cache_cannibalize_unlock, c);
                bc->alloc_lock = NULL;
                closure_wake_up(&bc->alloc_wait);
        }
}

int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
        struct btree_cache *bc = &c->btree_cache;
        struct task_struct *old;

        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current)
                goto success;

        if (!cl) {
                trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
                return -ENOMEM;
        }

        closure_wait(&bc->alloc_wait, cl);

        /* Try again, after adding ourselves to waitlist */
        old = cmpxchg(&bc->alloc_lock, NULL, current);
        if (old == NULL || old == current) {
                /* We raced */
                closure_wake_up(&bc->alloc_wait);
                goto success;
        }

        trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
        return -BCH_ERR_btree_cache_cannibalize_lock_blocked;

success:
        trace_and_count(c, btree_cache_cannibalize_lock, c);
        return 0;
}

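/*
 * Steal a node from the live list for reuse: first look for one we can
 * reclaim without flushing, then fall back to writing out dirty nodes.
 * The caller is expected to hold the cannibalize lock.
 */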
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        list_for_each_entry_reverse(b, &bc->live, list)
                if (!btree_node_reclaim(c, b, false))
                        return b;

        while (1) {
                list_for_each_entry_reverse(b, &bc->live, list)
                        if (!btree_node_write_and_reclaim(c, b))
                                return b;

                /*
                 * Rare case: all nodes were intent-locked.
                 * Just busy-wait.
                 */
                WARN_ONCE(1, "btree cache cannibalize failed\n");
                cond_resched();
        }
}

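/*
 * Allocate a btree node for use: reuse a struct btree from the freed lists if
 * possible, then find memory for it - from the freeable list, the allocator,
 * or as a last resort by cannibalizing another cached node.
 */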
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks)
{
        struct btree_cache *bc = &c->btree_cache;
        struct list_head *freed = pcpu_read_locks
                ? &bc->freed_pcpu
                : &bc->freed_nonpcpu;
        struct btree *b, *b2;
        u64 start_time = local_clock();
        unsigned flags;

        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);

        /*
         * We never free struct btree itself, just the memory that holds the on
         * disk node. Check the freed list before allocating a new one:
         */
        list_for_each_entry(b, freed, list)
                if (!btree_node_reclaim(c, b, false)) {
                        list_del_init(&b->list);
                        goto got_node;
                }

        b = __btree_node_mem_alloc(c, __GFP_NOWARN);
        if (!b) {
                mutex_unlock(&bc->lock);
                b = __btree_node_mem_alloc(c, GFP_KERNEL);
                if (!b)
                        goto err;
                mutex_lock(&bc->lock);
        }

        if (pcpu_read_locks)
                six_lock_pcpu_alloc(&b->c.lock);

        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
got_node:

        /*
         * btree_free() doesn't free memory; it sticks the node on the end of
         * the list. Check if there's any freed nodes there:
         */
        list_for_each_entry(b2, &bc->freeable, list)
                if (!btree_node_reclaim(c, b2, false)) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                        goto got_mem;
                }

        mutex_unlock(&bc->lock);

        if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
                goto err;

        mutex_lock(&bc->lock);
        bc->used++;
got_mem:
        mutex_unlock(&bc->lock);

        BUG_ON(btree_node_hashed(b));
        BUG_ON(btree_node_dirty(b));
        BUG_ON(btree_node_write_in_flight(b));
out:
        b->flags                = 0;
        b->written              = 0;
        b->nsets                = 0;
        b->sib_u64s[0]          = 0;
        b->sib_u64s[1]          = 0;
        b->whiteout_u64s        = 0;
        bch2_btree_keys_init(b);
        set_btree_node_accessed(b);

        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
                               start_time);

        memalloc_nofs_restore(flags);
        return b;
err:
        mutex_lock(&bc->lock);

        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b2 = btree_node_cannibalize(c);
                bch2_btree_node_hash_remove(bc, b2);

                if (b) {
                        swap(b->data, b2->data);
                        swap(b->aux_data, b2->aux_data);
                        btree_node_to_freedlist(bc, b2);
                        six_unlock_write(&b2->c.lock);
                        six_unlock_intent(&b2->c.lock);
                } else {
                        b = b2;
                        list_del_init(&b->list);
                }

                mutex_unlock(&bc->lock);

                trace_and_count(c, btree_cache_cannibalize, c);
                goto out;
        }

        mutex_unlock(&bc->lock);
        memalloc_nofs_restore(flags);
        return ERR_PTR(-ENOMEM);
}

/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_trans *trans,
                                struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        u32 seq;

        BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
        /*
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
        if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
                trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
        }

        b = bch2_btree_node_mem_alloc(c, level != 0);

        if (trans && b == ERR_PTR(-ENOMEM)) {
                trans->memory_allocation_failure = true;
                trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
        }

        if (IS_ERR(b))
                return b;

        /*
         * Btree nodes read in from disk should not have the accessed bit set
         * initially, so that linear scans don't thrash the cache:
         */
        clear_btree_node_accessed(b);

        bkey_copy(&b->key, k);
        if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */

                /* mark as unhashed... */
                b->hash_val = 0;

                mutex_lock(&bc->lock);
                list_add(&b->list, &bc->freeable);
                mutex_unlock(&bc->lock);

                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                return NULL;
        }

        set_btree_node_read_in_flight(b);

        six_unlock_write(&b->c.lock);
        seq = b->c.lock.state.seq;
        six_unlock_intent(&b->c.lock);

        /* Unlock before doing IO: */
        if (trans && sync)
                bch2_trans_unlock(trans);

        bch2_btree_node_read(c, b, sync);

        if (!sync)
                return NULL;

        if (trans) {
                int ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(ret);
                }
        }

        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
                if (trans)
                        trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
        }

        return b;
}

static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
        struct printbuf buf = PRINTBUF;

        if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
                return;

        prt_printf(&buf,
               "btree node header doesn't match ptr\n"
               "btree %s level %u\n"
               "ptr: ",
               bch2_btree_ids[b->c.btree_id], b->c.level);
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

        prt_printf(&buf, "\nheader: btree %s level %llu\n"
               "min ",
               bch2_btree_ids[BTREE_NODE_ID(b->data)],
               BTREE_NODE_LEVEL(b->data));
        bch2_bpos_to_text(&buf, b->data->min_key);

        prt_printf(&buf, "\nmax ");
        bch2_bpos_to_text(&buf, b->data->max_key);

        bch2_fs_inconsistent(c, "%s", buf.buf);
        printbuf_exit(&buf);
}

static inline void btree_check_header(struct bch_fs *c, struct btree *b)
{
        if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
            b->c.level != BTREE_NODE_LEVEL(b->data) ||
            !bpos_eq(b->data->max_key, b->key.k.p) ||
            (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
             !bpos_eq(b->data->min_key,
                      bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
                btree_bad_header(c, b);
}

static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                           const struct bkey_i *k, unsigned level,
                                           enum six_lock_type lock_type,
                                           unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                /*
                 * We must have the parent locked to call bch2_btree_node_fill(),
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
                b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
                                         level, lock_type, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b))
                        return b;
        } else {
                if (btree_node_read_locked(path, level + 1))
                        btree_node_unlock(trans, path, level + 1);

                ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
                        if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;

                        trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                        return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
                }

                /* avoid atomic set bit if it's not needed: */
                if (!btree_node_accessed(b))
                        set_btree_node_accessed(b);
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        goto retry;
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

/**
 * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * The btree node will be locked with @lock_type (either a read or an intent
 * lock) on return.
 */
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
{
        struct bch_fs *c = trans->c;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_node_mem_ptr(k);

        /*
         * Check b->hash_val _before_ calling btree_node_lock() - this might not
         * be the node we want anymore, and trying to lock the wrong node could
         * cause an unnecessary transaction restart:
         */
        if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
                     !b ||
                     b->hash_val != btree_ptr_hash_val(k)))
                return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(trans, path, level + 1);

        ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                return ERR_PTR(ret);

        BUG_ON(ret);

        if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                     b->c.level != level ||
                     race_fault())) {
                six_unlock_type(&b->c.lock, lock_type);
                if (bch2_btree_node_relock(trans, path, level + 1))
                        return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);

                trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
                return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
        }

        if (unlikely(btree_node_read_in_flight(b))) {
                u32 seq = b->c.lock.state.seq;

                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);

                bch2_btree_node_wait_on_read(b);

                /*
                 * should_be_locked is not set on this path yet, so we need to
                 * relock it specifically:
                 */
                if (trans) {
                        int ret = bch2_trans_relock(trans) ?:
                                bch2_btree_path_relock_intent(trans, path);
                        if (ret) {
                                BUG_ON(!trans->restarted);
                                return ERR_PTR(ret);
                        }
                }

                if (!six_relock_type(&b->c.lock, lock_type, seq))
                        return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
        }

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_type(&b->c.lock, lock_type);
                return ERR_PTR(-EIO);
        }

        EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);

        return b;
}

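/*
 * Variant of bch2_btree_node_get() for callers that don't have a btree_path:
 * takes a plain read lock on the node, reading it in from disk first if
 * necessary (unless @nofill is set).
 */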
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
                                         unsigned level,
                                         bool nofill)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
        struct bset_tree *t;
        int ret;

        EBUG_ON(level >= BTREE_MAX_DEPTH);

        if (c->opts.btree_node_mem_ptr_optimization) {
                b = btree_node_mem_ptr(k);
                if (b)
                        goto lock_node;
        }
retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
                if (nofill)
                        goto out;

                b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);

                /* We raced and found the btree node in the cache */
                if (!b)
                        goto retry;

                if (IS_ERR(b) &&
                    !bch2_btree_cache_cannibalize_lock(c, NULL))
                        goto retry;

                if (IS_ERR(b))
                        goto out;
        } else {
lock_node:
                ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        return ERR_PTR(ret);

                BUG_ON(ret);

                if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
                             b->c.btree_id != btree_id ||
                             b->c.level != level)) {
                        six_unlock_read(&b->c.lock);
                        goto retry;
                }
        }

        /* XXX: waiting on IO with btree locks held: */
        __bch2_btree_node_wait_on_read(b);

        prefetch(b->aux_data);

        for_each_bset(b, t) {
                void *p = (u64 *) b->aux_data + t->aux_data_offset;

                prefetch(p + L1_CACHE_BYTES * 0);
                prefetch(p + L1_CACHE_BYTES * 1);
                prefetch(p + L1_CACHE_BYTES * 2);
        }

        /* avoid atomic set bit if it's not needed: */
        if (!btree_node_accessed(b))
                set_btree_node_accessed(b);

        if (unlikely(btree_node_read_error(b))) {
                six_unlock_read(&b->c.lock);
                b = ERR_PTR(-EIO);
                goto out;
        }

        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
}

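/*
 * Start an asynchronous read of a btree node into the cache, if it isn't
 * already present.
 */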
int bch2_btree_node_prefetch(struct bch_fs *c,
                             struct btree_trans *trans,
                             struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
{
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);

        b = btree_cache_find(bc, k);
        if (b)
                return 0;

        b = bch2_btree_node_fill(c, trans, path, k, btree_id,
                                 level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
}

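/*
 * Wait for outstanding IO, flush the node if dirty, then remove it from the
 * hash table and free its data.
 */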
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
{
        struct bch_fs *c = trans->c;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;

        b = btree_cache_find(bc, k);
        if (!b)
                return;
wait_on_io:
        /* not allowed to wait on io with btree locks held: */

        /* XXX we're called from btree_gc which will be holding other btree
         * nodes locked
         */
        __bch2_btree_node_wait_on_read(b);
        __bch2_btree_node_wait_on_write(b);

        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
        btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);

        if (btree_node_dirty(b)) {
                __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
        }

        BUG_ON(btree_node_dirty(b));

        mutex_lock(&bc->lock);
        btree_node_data_free(c, b);
        bch2_btree_node_hash_remove(bc, b);
        mutex_unlock(&bc->lock);

        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
}

void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
{
        const struct bkey_format *f = &b->format;
        struct bset_stats stats;

        memset(&stats, 0, sizeof(stats));

        bch2_btree_keys_stats(b, &stats);

        prt_printf(out, "l %u ", b->c.level);
        bch2_bpos_to_text(out, b->data->min_key);
        prt_printf(out, " - ");
        bch2_bpos_to_text(out, b->data->max_key);
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));

        prt_printf(out, "\n"
               "    format: u64s %u fields %u %u %u %u %u\n"
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
               "    nr packed keys %u\n"
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
               f->key_u64s,
               f->bits_per_field[0],
               f->bits_per_field[1],
               f->bits_per_field[2],
               f->bits_per_field[3],
               f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
               c->btree_foreground_merge_threshold,
               b->nr.packed_keys,
               b->nr.unpacked_keys,
               stats.floats,
               stats.failed);
}

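/*
 * Print btree cache statistics, including the shrinker's freed and
 * not_freed_* counters.
 */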
void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc)
{
        prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
        prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
        prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);

        prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
        prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
        prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
        prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
        prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
        prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
        prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
        prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
        prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
        prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
}