1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12 #include "trace.h"
13
14 #include <linux/prefetch.h>
15 #include <linux/sched/mm.h>
16 #include <linux/seq_buf.h>
17
18 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
19 do {                                             \
20         if (shrinker_counter)                    \
21                 bc->not_freed_##counter++;       \
22 } while (0)
23
24 const char * const bch2_btree_node_flags[] = {
25 #define x(f)    #f,
26         BTREE_FLAGS()
27 #undef x
28         NULL
29 };
30
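/*
 * The reserve works out to a base of 16 nodes, plus 8 if we don't yet have a
 * root for btree 0, plus 8 for every btree whose root is an interior node
 * (level >= 1): for example, three multi-level btrees give 16 + 3 * 8 = 40
 * reserved nodes.
 */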
31 void bch2_recalc_btree_reserve(struct bch_fs *c)
32 {
33         unsigned i, reserve = 16;
34
35         if (!c->btree_roots[0].b)
36                 reserve += 8;
37
38         for (i = 0; i < BTREE_ID_NR; i++)
39                 if (c->btree_roots[i].b)
40                         reserve += min_t(unsigned, 1,
41                                          c->btree_roots[i].b->c.level) * 8;
42
43         c->btree_cache.reserve = reserve;
44 }
45
46 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
47 {
48         return max_t(int, 0, bc->used - bc->reserve);
49 }
50
51 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
52 {
53         if (b->c.lock.readers)
54                 list_move(&b->list, &bc->freed_pcpu);
55         else
56                 list_move(&b->list, &bc->freed_nonpcpu);
57 }
58
59 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
60 {
61         struct btree_cache *bc = &c->btree_cache;
62
63         EBUG_ON(btree_node_write_in_flight(b));
64
65         clear_btree_node_just_written(b);
66
67         kvpfree(b->data, btree_bytes(c));
68         b->data = NULL;
69 #ifdef __KERNEL__
70         kvfree(b->aux_data);
71 #else
72         munmap(b->aux_data, btree_aux_data_bytes(b));
73 #endif
74         b->aux_data = NULL;
75
76         bc->used--;
77
78         btree_node_to_freedlist(bc, b);
79 }
80
81 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
82                                    const void *obj)
83 {
84         const struct btree *b = obj;
85         const u64 *v = arg->key;
86
87         return b->hash_val == *v ? 0 : 1;
88 }
89
90 static const struct rhashtable_params bch_btree_cache_params = {
91         .head_offset    = offsetof(struct btree, hash),
92         .key_offset     = offsetof(struct btree, hash_val),
93         .key_len        = sizeof(u64),
94         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
95 };
96
97 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
98 {
99         BUG_ON(b->data || b->aux_data);
100
101         b->data = kvpmalloc(btree_bytes(c), gfp);
102         if (!b->data)
103                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
104 #ifdef __KERNEL__
105         b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
106 #else
107         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
108                            PROT_READ|PROT_WRITE|PROT_EXEC,
109                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
110         if (b->aux_data == MAP_FAILED)
111                 b->aux_data = NULL;
112 #endif
113         if (!b->aux_data) {
114                 kvpfree(b->data, btree_bytes(c));
115                 b->data = NULL;
116                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
117         }
118
119         return 0;
120 }
121
122 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
123 {
124         struct btree *b;
125
126         b = kzalloc(sizeof(struct btree), gfp);
127         if (!b)
128                 return NULL;
129
130         bkey_btree_ptr_init(&b->key);
131         INIT_LIST_HEAD(&b->list);
132         INIT_LIST_HEAD(&b->write_blocked);
133         b->byte_order = ilog2(btree_bytes(c));
134         return b;
135 }
136
137 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
138 {
139         struct btree_cache *bc = &c->btree_cache;
140         struct btree *b;
141
142         b = __btree_node_mem_alloc(c, GFP_KERNEL);
143         if (!b)
144                 return NULL;
145
146         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
147                 kfree(b);
148                 return NULL;
149         }
150
151         bch2_btree_lock_init(&b->c, 0);
152
153         bc->used++;
154         list_add(&b->list, &bc->freeable);
155         return b;
156 }
157
158 /* Btree in memory cache - hash table */
159
160 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
161 {
162         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
163
164         BUG_ON(ret);
165
166         /* Cause future lookups for this node to fail: */
167         b->hash_val = 0;
168 }
169
170 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
171 {
172         BUG_ON(b->hash_val);
173         b->hash_val = btree_ptr_hash_val(&b->key);
174
175         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
176                                              bch_btree_cache_params);
177 }
178
179 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
180                                 unsigned level, enum btree_id id)
181 {
182         int ret;
183
184         b->c.level      = level;
185         b->c.btree_id   = id;
186
187         mutex_lock(&bc->lock);
188         ret = __bch2_btree_node_hash_insert(bc, b);
189         if (!ret)
190                 list_add_tail(&b->list, &bc->live);
191         mutex_unlock(&bc->lock);
192
193         return ret;
194 }
195
196 __flatten
197 static inline struct btree *btree_cache_find(struct btree_cache *bc,
198                                      const struct bkey_i *k)
199 {
200         u64 v = btree_ptr_hash_val(k);
201
202         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
203 }
204
205 /*
206  * Try to free a btree node's memory: with @flush set, dirty nodes are written
207  * out and in-flight IO is waited on; without it, those cases fail immediately.
208  */
209 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
210 {
211         struct btree_cache *bc = &c->btree_cache;
212         int ret = 0;
213
214         lockdep_assert_held(&bc->lock);
215 wait_on_io:
216         if (b->flags & ((1U << BTREE_NODE_dirty)|
217                         (1U << BTREE_NODE_read_in_flight)|
218                         (1U << BTREE_NODE_write_in_flight))) {
219                 if (!flush) {
220                         if (btree_node_dirty(b))
221                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
222                         else if (btree_node_read_in_flight(b))
223                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
224                         else if (btree_node_write_in_flight(b))
225                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
226                         return -BCH_ERR_ENOMEM_btree_node_reclaim;
227                 }
228
229                 /* XXX: waiting on IO with btree cache lock held */
230                 bch2_btree_node_wait_on_read(b);
231                 bch2_btree_node_wait_on_write(b);
232         }
233
234         if (!six_trylock_intent(&b->c.lock)) {
235                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
236                 return -BCH_ERR_ENOMEM_btree_node_reclaim;
237         }
238
239         if (!six_trylock_write(&b->c.lock)) {
240                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
241                 goto out_unlock_intent;
242         }
243
244         /* recheck under lock */
245         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
246                         (1U << BTREE_NODE_write_in_flight))) {
247                 if (!flush) {
248                         if (btree_node_read_in_flight(b))
249                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
250                         else if (btree_node_write_in_flight(b))
251                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
252                         goto out_unlock;
253                 }
254                 six_unlock_write(&b->c.lock);
255                 six_unlock_intent(&b->c.lock);
256                 goto wait_on_io;
257         }
258
259         if (btree_node_noevict(b)) {
260                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
261                 goto out_unlock;
262         }
263         if (btree_node_write_blocked(b)) {
264                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
265                 goto out_unlock;
266         }
267         if (btree_node_will_make_reachable(b)) {
268                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
269                 goto out_unlock;
270         }
271
272         if (btree_node_dirty(b)) {
273                 if (!flush) {
274                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
275                         goto out_unlock;
276                 }
277                 /*
278                  * Using the underscore version because we don't want to compact
279                  * bsets after the write, since this node is about to be evicted
280                  * - unless btree verify mode is enabled, since it runs out of
281                  * the post write cleanup:
282                  */
283                 if (bch2_verify_btree_ondisk)
284                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
285                                               BTREE_WRITE_cache_reclaim);
286                 else
287                         __bch2_btree_node_write(c, b,
288                                                 BTREE_WRITE_cache_reclaim);
289
290                 six_unlock_write(&b->c.lock);
291                 six_unlock_intent(&b->c.lock);
292                 goto wait_on_io;
293         }
294 out:
295         if (b->hash_val && !ret)
296                 trace_and_count(c, btree_cache_reap, c, b);
297         return ret;
298 out_unlock:
299         six_unlock_write(&b->c.lock);
300 out_unlock_intent:
301         six_unlock_intent(&b->c.lock);
302         ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
303         goto out;
304 }
305
306 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
307 {
308         return __btree_node_reclaim(c, b, false, shrinker_counter);
309 }
310
311 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
312 {
313         return __btree_node_reclaim(c, b, true, false);
314 }
315
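/*
 * Shrinker scan callback: walks the freeable list first (leaving a few nodes
 * behind so a btree split doesn't have to hit the system allocator), then the
 * live list, clearing accessed bits on the first pass over a node and freeing
 * nodes that can be reclaimed. When dirty nodes plus the requested scan count
 * approach the cache size, it also kicks off node writes
 * (BTREE_WRITE_cache_reclaim) so that future scans have something to free.
 */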
316 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
317                                            struct shrink_control *sc)
318 {
319         struct bch_fs *c = container_of(shrink, struct bch_fs,
320                                         btree_cache.shrink);
321         struct btree_cache *bc = &c->btree_cache;
322         struct btree *b, *t;
323         unsigned long nr = sc->nr_to_scan;
324         unsigned long can_free = 0;
325         unsigned long freed = 0;
326         unsigned long touched = 0;
327         unsigned i, flags;
328         unsigned long ret = SHRINK_STOP;
329         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
330                 bc->used * 3 / 4;
331
332         if (bch2_btree_shrinker_disabled)
333                 return SHRINK_STOP;
334
335         mutex_lock(&bc->lock);
336         flags = memalloc_nofs_save();
337
338         /*
339          * It's _really_ critical that we don't free too many btree nodes - we
340          * have to always leave ourselves a reserve. The reserve is how we
341          * guarantee that allocating memory for a new btree node can always
342          * succeed, so that inserting keys into the btree can always succeed and
343          * IO can always make forward progress:
344          */
345         can_free = btree_cache_can_free(bc);
346         nr = min_t(unsigned long, nr, can_free);
347
348         i = 0;
349         list_for_each_entry_safe(b, t, &bc->freeable, list) {
350                 /*
351                  * Leave a few nodes on the freeable list, so that a btree split
352                  * won't have to hit the system allocator:
353                  */
354                 if (++i <= 3)
355                         continue;
356
357                 touched++;
358
359                 if (touched >= nr)
360                         goto out;
361
362                 if (!btree_node_reclaim(c, b, true)) {
363                         btree_node_data_free(c, b);
364                         six_unlock_write(&b->c.lock);
365                         six_unlock_intent(&b->c.lock);
366                         freed++;
367                         bc->freed++;
368                 }
369         }
370 restart:
371         list_for_each_entry_safe(b, t, &bc->live, list) {
372                 touched++;
373
374                 if (btree_node_accessed(b)) {
375                         clear_btree_node_accessed(b);
376                         bc->not_freed_access_bit++;
377                 } else if (!btree_node_reclaim(c, b, true)) {
378                         freed++;
379                         btree_node_data_free(c, b);
380                         bc->freed++;
381
382                         bch2_btree_node_hash_remove(bc, b);
383                         six_unlock_write(&b->c.lock);
384                         six_unlock_intent(&b->c.lock);
385
386                         if (freed == nr)
387                                 goto out_rotate;
388                 } else if (trigger_writes &&
389                            btree_node_dirty(b) &&
390                            !btree_node_will_make_reachable(b) &&
391                            !btree_node_write_blocked(b) &&
392                            six_trylock_read(&b->c.lock)) {
393                         list_move(&bc->live, &b->list);
394                         mutex_unlock(&bc->lock);
395                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
396                         six_unlock_read(&b->c.lock);
397                         if (touched >= nr)
398                                 goto out_nounlock;
399                         mutex_lock(&bc->lock);
400                         goto restart;
401                 }
402
403                 if (touched >= nr)
404                         break;
405         }
406 out_rotate:
407         if (&t->list != &bc->live)
408                 list_move_tail(&bc->live, &t->list);
409 out:
410         mutex_unlock(&bc->lock);
411 out_nounlock:
412         ret = freed;
413         memalloc_nofs_restore(flags);
414         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
415         return ret;
416 }
417
418 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
419                                             struct shrink_control *sc)
420 {
421         struct bch_fs *c = container_of(shrink, struct bch_fs,
422                                         btree_cache.shrink);
423         struct btree_cache *bc = &c->btree_cache;
424
425         if (bch2_btree_shrinker_disabled)
426                 return 0;
427
428         return btree_cache_can_free(bc);
429 }
430
431 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
432 {
433         struct bch_fs *c = container_of(shrink, struct bch_fs,
434                                         btree_cache.shrink);
435         char *cbuf;
436         size_t buflen = seq_buf_get_buf(s, &cbuf);
437         struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
438
439         bch2_btree_cache_to_text(&out, &c->btree_cache);
440         seq_buf_commit(s, out.pos);
441 }
442
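/*
 * Tear down the btree node cache at filesystem shutdown: gather every
 * remaining node (btree roots, the debug verify node, the freeable list) onto
 * the live list, complete any pending writes, then free node memory and the
 * struct btree allocations themselves.
 */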
443 void bch2_fs_btree_cache_exit(struct bch_fs *c)
444 {
445         struct btree_cache *bc = &c->btree_cache;
446         struct btree *b;
447         unsigned i, flags;
448
449         unregister_shrinker(&bc->shrink);
450
451         /* vfree() can allocate memory: */
452         flags = memalloc_nofs_save();
453         mutex_lock(&bc->lock);
454
455         if (c->verify_data)
456                 list_move(&c->verify_data->list, &bc->live);
457
458         kvpfree(c->verify_ondisk, btree_bytes(c));
459
460         for (i = 0; i < BTREE_ID_NR; i++)
461                 if (c->btree_roots[i].b)
462                         list_add(&c->btree_roots[i].b->list, &bc->live);
463
464         list_splice(&bc->freeable, &bc->live);
465
466         while (!list_empty(&bc->live)) {
467                 b = list_first_entry(&bc->live, struct btree, list);
468
469                 BUG_ON(btree_node_read_in_flight(b) ||
470                        btree_node_write_in_flight(b));
471
472                 if (btree_node_dirty(b))
473                         bch2_btree_complete_write(c, b, btree_current_write(b));
474                 clear_btree_node_dirty_acct(c, b);
475
476                 btree_node_data_free(c, b);
477         }
478
479         BUG_ON(atomic_read(&c->btree_cache.dirty));
480
481         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
482
483         while (!list_empty(&bc->freed_nonpcpu)) {
484                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
485                 list_del(&b->list);
486                 six_lock_exit(&b->c.lock);
487                 kfree(b);
488         }
489
490         mutex_unlock(&bc->lock);
491         memalloc_nofs_restore(flags);
492
493         if (bc->table_init_done)
494                 rhashtable_destroy(&bc->table);
495 }
496
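/*
 * Set up the btree node cache: initialize the hash table, preallocate
 * bc->reserve nodes onto the freeable list, and register the shrinker.
 */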
497 int bch2_fs_btree_cache_init(struct bch_fs *c)
498 {
499         struct btree_cache *bc = &c->btree_cache;
500         unsigned i;
501         int ret = 0;
502
503         pr_verbose_init(c->opts, "");
504
505         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
506         if (ret)
507                 goto out;
508
509         bc->table_init_done = true;
510
511         bch2_recalc_btree_reserve(c);
512
513         for (i = 0; i < bc->reserve; i++)
514                 if (!__bch2_btree_node_mem_alloc(c)) {
515                         ret = -BCH_ERR_ENOMEM_fs_btree_cache_init;
516                         goto out;
517                 }
518
519         list_splice_init(&bc->live, &bc->freeable);
520
521         mutex_init(&c->verify_lock);
522
523         bc->shrink.count_objects        = bch2_btree_cache_count;
524         bc->shrink.scan_objects         = bch2_btree_cache_scan;
525         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
526         bc->shrink.seeks                = 4;
527         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
528 out:
529         pr_verbose_init(c->opts, "ret %i", ret);
530         return ret;
531 }
532
533 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
534 {
535         mutex_init(&bc->lock);
536         INIT_LIST_HEAD(&bc->live);
537         INIT_LIST_HEAD(&bc->freeable);
538         INIT_LIST_HEAD(&bc->freed_pcpu);
539         INIT_LIST_HEAD(&bc->freed_nonpcpu);
540 }
541
542 /*
543  * We can only have one thread cannibalizing other cached btree nodes at a time,
544  * or we'll deadlock. We use an open coded mutex (bc->alloc_lock) to ensure
545  * that; bch2_btree_cache_cannibalize_lock() takes it. This means every time
546  * we unlock the root of the btree, we need to release this lock if we hold it.
547  */
548 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
549 {
550         struct btree_cache *bc = &c->btree_cache;
551
552         if (bc->alloc_lock == current) {
553                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
554                 bc->alloc_lock = NULL;
555                 closure_wake_up(&bc->alloc_wait);
556         }
557 }
558
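/*
 * A sketch of the expected calling pattern (the actual callers live in the
 * btree node allocation paths; @cl is what lets a caller wait for the current
 * holder to drop the lock):
 *
 *	ret = bch2_btree_cache_cannibalize_lock(c, cl);
 *	if (ret)
 *		return ret;
 *	...allocate btree nodes...
 *	bch2_btree_cache_cannibalize_unlock(c);
 */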
559 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
560 {
561         struct btree_cache *bc = &c->btree_cache;
562         struct task_struct *old;
563
564         old = cmpxchg(&bc->alloc_lock, NULL, current);
565         if (old == NULL || old == current)
566                 goto success;
567
568         if (!cl) {
569                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
570                 return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
571         }
572
573         closure_wait(&bc->alloc_wait, cl);
574
575         /* Try again, after adding ourselves to waitlist */
576         old = cmpxchg(&bc->alloc_lock, NULL, current);
577         if (old == NULL || old == current) {
578                 /* We raced */
579                 closure_wake_up(&bc->alloc_wait);
580                 goto success;
581         }
582
583         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
584         return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
585
586 success:
587         trace_and_count(c, btree_cache_cannibalize_lock, c);
588         return 0;
589 }
590
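/*
 * Pick a cached node to steal memory from: first look for a live node that can
 * be reclaimed without flushing, then retry allowing writes; in the rare case
 * that every node is locked, warn and busy-wait with cond_resched().
 */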
591 static struct btree *btree_node_cannibalize(struct bch_fs *c)
592 {
593         struct btree_cache *bc = &c->btree_cache;
594         struct btree *b;
595
596         list_for_each_entry_reverse(b, &bc->live, list)
597                 if (!btree_node_reclaim(c, b, false))
598                         return b;
599
600         while (1) {
601                 list_for_each_entry_reverse(b, &bc->live, list)
602                         if (!btree_node_write_and_reclaim(c, b))
603                                 return b;
604
605                 /*
606                  * Rare case: all nodes were intent-locked.
607                  * Just busy-wait.
608                  */
609                 WARN_ONCE(1, "btree cache cannibalize failed\n");
610                 cond_resched();
611         }
612 }
613
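/*
 * Allocate a btree node for the cache, in order of preference: reuse a struct
 * btree from the freed lists, or allocate a new one (GFP_NOWAIT first, then
 * GFP_KERNEL with the transaction unlocked); take data buffers from a node on
 * the freeable list, or allocate them; and finally fall back to cannibalizing
 * another cached node, if this thread holds the cannibalize lock.
 */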
614 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
615 {
616         struct bch_fs *c = trans->c;
617         struct btree_cache *bc = &c->btree_cache;
618         struct list_head *freed = pcpu_read_locks
619                 ? &bc->freed_pcpu
620                 : &bc->freed_nonpcpu;
621         struct btree *b, *b2;
622         u64 start_time = local_clock();
623         unsigned flags;
624
625         flags = memalloc_nofs_save();
626         mutex_lock(&bc->lock);
627
628         /*
629          * We never free struct btree itself, just the memory that holds the on
630          * disk node. Check the freed list before allocating a new one:
631          */
632         list_for_each_entry(b, freed, list)
633                 if (!btree_node_reclaim(c, b, false)) {
634                         list_del_init(&b->list);
635                         goto got_node;
636                 }
637
638         b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
639         if (!b) {
640                 mutex_unlock(&bc->lock);
641                 bch2_trans_unlock(trans);
642                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
643                 if (!b)
644                         goto err;
645                 mutex_lock(&bc->lock);
646         }
647
648         bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
649
650         BUG_ON(!six_trylock_intent(&b->c.lock));
651         BUG_ON(!six_trylock_write(&b->c.lock));
652 got_node:
653
654         /*
655          * btree_free() doesn't free memory; it sticks the node on the end of
656          * the list. Check if there's any freed nodes there:
657          */
658         list_for_each_entry(b2, &bc->freeable, list)
659                 if (!btree_node_reclaim(c, b2, false)) {
660                         swap(b->data, b2->data);
661                         swap(b->aux_data, b2->aux_data);
662                         btree_node_to_freedlist(bc, b2);
663                         six_unlock_write(&b2->c.lock);
664                         six_unlock_intent(&b2->c.lock);
665                         goto got_mem;
666                 }
667
668         mutex_unlock(&bc->lock);
669
670         if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
671                 bch2_trans_unlock(trans);
672                 if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
673                         goto err;
674         }
675
676         mutex_lock(&bc->lock);
677         bc->used++;
678 got_mem:
679         mutex_unlock(&bc->lock);
680
681         BUG_ON(btree_node_hashed(b));
682         BUG_ON(btree_node_dirty(b));
683         BUG_ON(btree_node_write_in_flight(b));
684 out:
685         b->flags                = 0;
686         b->written              = 0;
687         b->nsets                = 0;
688         b->sib_u64s[0]          = 0;
689         b->sib_u64s[1]          = 0;
690         b->whiteout_u64s        = 0;
691         bch2_btree_keys_init(b);
692         set_btree_node_accessed(b);
693
694         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
695                                start_time);
696
697         memalloc_nofs_restore(flags);
698         return b;
699 err:
700         mutex_lock(&bc->lock);
701
702         /* Try to cannibalize another cached btree node: */
703         if (bc->alloc_lock == current) {
704                 b2 = btree_node_cannibalize(c);
705                 clear_btree_node_just_written(b2);
706                 bch2_btree_node_hash_remove(bc, b2);
707
708                 if (b) {
709                         swap(b->data, b2->data);
710                         swap(b->aux_data, b2->aux_data);
711                         btree_node_to_freedlist(bc, b2);
712                         six_unlock_write(&b2->c.lock);
713                         six_unlock_intent(&b2->c.lock);
714                 } else {
715                         b = b2;
716                         list_del_init(&b->list);
717                 }
718
719                 mutex_unlock(&bc->lock);
720
721                 trace_and_count(c, btree_cache_cannibalize, c);
722                 goto out;
723         }
724
725         mutex_unlock(&bc->lock);
726         memalloc_nofs_restore(flags);
727         return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
728 }
729
730 /* Slowpath, don't want it inlined into btree_iter_traverse() */
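/*
 * Allocates an in-memory node for @k, inserts it into the hash table and reads
 * it in from disk. Returns NULL if we raced with another fill and the node is
 * already cached, or - for a non-@sync (prefetch) fill - once the read has
 * been issued; errors and transaction restarts are returned as ERR_PTRs.
 */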
731 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
732                                 struct btree_path *path,
733                                 const struct bkey_i *k,
734                                 enum btree_id btree_id,
735                                 unsigned level,
736                                 enum six_lock_type lock_type,
737                                 bool sync)
738 {
739         struct bch_fs *c = trans->c;
740         struct btree_cache *bc = &c->btree_cache;
741         struct btree *b;
742         u32 seq;
743
744         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
745         /*
746          * Parent node must be locked, else we could read in a btree node that's
747          * been freed:
748          */
749         if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
750                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
751                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
752         }
753
754         b = bch2_btree_node_mem_alloc(trans, level != 0);
755
756         if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
757                 trans->memory_allocation_failure = true;
758                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
759                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
760         }
761
762         if (IS_ERR(b))
763                 return b;
764
765         /*
766          * Btree nodes read in from disk should not have the accessed bit set
767          * initially, so that linear scans don't thrash the cache:
768          */
769         clear_btree_node_accessed(b);
770
771         bkey_copy(&b->key, k);
772         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
773                 /* raced with another fill: */
774
775                 /* mark as unhashed... */
776                 b->hash_val = 0;
777
778                 mutex_lock(&bc->lock);
779                 list_add(&b->list, &bc->freeable);
780                 mutex_unlock(&bc->lock);
781
782                 six_unlock_write(&b->c.lock);
783                 six_unlock_intent(&b->c.lock);
784                 return NULL;
785         }
786
787         set_btree_node_read_in_flight(b);
788
789         six_unlock_write(&b->c.lock);
790         seq = six_lock_seq(&b->c.lock);
791         six_unlock_intent(&b->c.lock);
792
793         /* Unlock before doing IO: */
794         if (trans && sync)
795                 bch2_trans_unlock_noassert(trans);
796
797         bch2_btree_node_read(c, b, sync);
798
799         if (!sync)
800                 return NULL;
801
802         if (path) {
803                 int ret = bch2_trans_relock(trans) ?:
804                         bch2_btree_path_relock_intent(trans, path);
805                 if (ret) {
806                         BUG_ON(!trans->restarted);
807                         return ERR_PTR(ret);
808                 }
809         }
810
811         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
812                 if (path)
813                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
814                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
815         }
816
817         return b;
818 }
819
820 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
821 {
822         struct printbuf buf = PRINTBUF;
823
824         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
825                 return;
826
827         prt_printf(&buf,
828                "btree node header doesn't match ptr\n"
829                "btree %s level %u\n"
830                "ptr: ",
831                bch2_btree_ids[b->c.btree_id], b->c.level);
832         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
833
834         prt_printf(&buf, "\nheader: btree %s level %llu\n"
835                "min ",
836                bch2_btree_ids[BTREE_NODE_ID(b->data)],
837                BTREE_NODE_LEVEL(b->data));
838         bch2_bpos_to_text(&buf, b->data->min_key);
839
840         prt_printf(&buf, "\nmax ");
841         bch2_bpos_to_text(&buf, b->data->max_key);
842
843         bch2_fs_inconsistent(c, "%s", buf.buf);
844         printbuf_exit(&buf);
845 }
846
847 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
848 {
849         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
850             b->c.level != BTREE_NODE_LEVEL(b->data) ||
851             !bpos_eq(b->data->max_key, b->key.k.p) ||
852             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
853              !bpos_eq(b->data->min_key,
854                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
855                 btree_bad_header(c, b);
856 }
857
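/*
 * Look up @k in the cache, filling it in from disk on a miss; after locking
 * the node, recheck that it still matches @k (it could have been evicted and
 * reused while we waited on the lock) and retry or restart the transaction if
 * it doesn't.
 */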
858 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
859                                            const struct bkey_i *k, unsigned level,
860                                            enum six_lock_type lock_type,
861                                            unsigned long trace_ip)
862 {
863         struct bch_fs *c = trans->c;
864         struct btree_cache *bc = &c->btree_cache;
865         struct btree *b;
866         struct bset_tree *t;
867         bool need_relock = false;
868         int ret;
869
870         EBUG_ON(level >= BTREE_MAX_DEPTH);
871 retry:
872         b = btree_cache_find(bc, k);
873         if (unlikely(!b)) {
874                 /*
875                  * We must have the parent locked to call bch2_btree_node_fill(),
876                  * else we could read in a btree node from disk that's been
877                  * freed:
878                  */
879                 b = bch2_btree_node_fill(trans, path, k, path->btree_id,
880                                          level, lock_type, true);
881                 need_relock = true;
882
883                 /* We raced and found the btree node in the cache */
884                 if (!b)
885                         goto retry;
886
887                 if (IS_ERR(b))
888                         return b;
889         } else {
890                 if (btree_node_read_locked(path, level + 1))
891                         btree_node_unlock(trans, path, level + 1);
892
893                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
894                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
895                         return ERR_PTR(ret);
896
897                 BUG_ON(ret);
898
899                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
900                              b->c.level != level ||
901                              race_fault())) {
902                         six_unlock_type(&b->c.lock, lock_type);
903                         if (bch2_btree_node_relock(trans, path, level + 1))
904                                 goto retry;
905
906                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
907                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
908                 }
909
910                 /* avoid atomic set bit if it's not needed: */
911                 if (!btree_node_accessed(b))
912                         set_btree_node_accessed(b);
913         }
914
915         if (unlikely(btree_node_read_in_flight(b))) {
916                 u32 seq = six_lock_seq(&b->c.lock);
917
918                 six_unlock_type(&b->c.lock, lock_type);
919                 bch2_trans_unlock(trans);
920                 need_relock = true;
921
922                 bch2_btree_node_wait_on_read(b);
923
924                 /*
925                  * should_be_locked is not set on this path yet, so we need to
926                  * relock it specifically:
927                  */
928                 if (!six_relock_type(&b->c.lock, lock_type, seq))
929                         goto retry;
930         }
931
932         if (unlikely(need_relock)) {
933                 int ret = bch2_trans_relock(trans) ?:
934                         bch2_btree_path_relock_intent(trans, path);
935                 if (ret) {
936                         six_unlock_type(&b->c.lock, lock_type);
937                         return ERR_PTR(ret);
938                 }
939         }
940
941         prefetch(b->aux_data);
942
943         for_each_bset(b, t) {
944                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
945
946                 prefetch(p + L1_CACHE_BYTES * 0);
947                 prefetch(p + L1_CACHE_BYTES * 1);
948                 prefetch(p + L1_CACHE_BYTES * 2);
949         }
950
951         if (unlikely(btree_node_read_error(b))) {
952                 six_unlock_type(&b->c.lock, lock_type);
953                 return ERR_PTR(-EIO);
954         }
955
956         EBUG_ON(b->c.btree_id != path->btree_id);
957         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
958         btree_check_header(c, b);
959
960         return b;
961 }
962
963 /**
964  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
965  * in from disk if necessary.
966  *
967  * The btree node will be locked with @lock_type when this returns
968  * successfully.
969  */
970 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
971                                   const struct bkey_i *k, unsigned level,
972                                   enum six_lock_type lock_type,
973                                   unsigned long trace_ip)
974 {
975         struct bch_fs *c = trans->c;
976         struct btree *b;
977         struct bset_tree *t;
978         int ret;
979
980         EBUG_ON(level >= BTREE_MAX_DEPTH);
981
982         b = btree_node_mem_ptr(k);
983
984         /*
985          * Check b->hash_val _before_ calling btree_node_lock() - this might not
986          * be the node we want anymore, and trying to lock the wrong node could
987          * cause an unnecessary transaction restart:
988          */
989         if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
990                      !b ||
991                      b->hash_val != btree_ptr_hash_val(k)))
992                 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
993
994         if (btree_node_read_locked(path, level + 1))
995                 btree_node_unlock(trans, path, level + 1);
996
997         ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
998         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
999                 return ERR_PTR(ret);
1000
1001         BUG_ON(ret);
1002
1003         if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1004                      b->c.level != level ||
1005                      race_fault())) {
1006                 six_unlock_type(&b->c.lock, lock_type);
1007                 if (bch2_btree_node_relock(trans, path, level + 1))
1008                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1009
1010                 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1011                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1012         }
1013
1014         if (unlikely(btree_node_read_in_flight(b))) {
1015                 u32 seq = six_lock_seq(&b->c.lock);
1016
1017                 six_unlock_type(&b->c.lock, lock_type);
1018                 bch2_trans_unlock(trans);
1019
1020                 bch2_btree_node_wait_on_read(b);
1021
1022                 /*
1023                  * should_be_locked is not set on this path yet, so we need to
1024                  * relock it specifically:
1025                  */
1026                 if (trans) {
1027                         int ret = bch2_trans_relock(trans) ?:
1028                                 bch2_btree_path_relock_intent(trans, path);
1029                         if (ret) {
1030                                 BUG_ON(!trans->restarted);
1031                                 return ERR_PTR(ret);
1032                         }
1033                 }
1034
1035                 if (!six_relock_type(&b->c.lock, lock_type, seq))
1036                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1037         }
1038
1039         prefetch(b->aux_data);
1040
1041         for_each_bset(b, t) {
1042                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1043
1044                 prefetch(p + L1_CACHE_BYTES * 0);
1045                 prefetch(p + L1_CACHE_BYTES * 1);
1046                 prefetch(p + L1_CACHE_BYTES * 2);
1047         }
1048
1049         /* avoid atomic set bit if it's not needed: */
1050         if (!btree_node_accessed(b))
1051                 set_btree_node_accessed(b);
1052
1053         if (unlikely(btree_node_read_error(b))) {
1054                 six_unlock_type(&b->c.lock, lock_type);
1055                 return ERR_PTR(-EIO);
1056         }
1057
1058         EBUG_ON(b->c.btree_id != path->btree_id);
1059         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1060         btree_check_header(c, b);
1061
1062         return b;
1063 }
1064
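/*
 * Variant of bch2_btree_node_get() for callers that don't have a btree_path:
 * always takes a read lock, and with @nofill a cache miss returns NULL rather
 * than reading the node in.
 */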
1065 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1066                                          const struct bkey_i *k,
1067                                          enum btree_id btree_id,
1068                                          unsigned level,
1069                                          bool nofill)
1070 {
1071         struct bch_fs *c = trans->c;
1072         struct btree_cache *bc = &c->btree_cache;
1073         struct btree *b;
1074         struct bset_tree *t;
1075         int ret;
1076
1077         EBUG_ON(level >= BTREE_MAX_DEPTH);
1078
1079         if (c->opts.btree_node_mem_ptr_optimization) {
1080                 b = btree_node_mem_ptr(k);
1081                 if (b)
1082                         goto lock_node;
1083         }
1084 retry:
1085         b = btree_cache_find(bc, k);
1086         if (unlikely(!b)) {
1087                 if (nofill)
1088                         goto out;
1089
1090                 b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1091                                          level, SIX_LOCK_read, true);
1092
1093                 /* We raced and found the btree node in the cache */
1094                 if (!b)
1095                         goto retry;
1096
1097                 if (IS_ERR(b) &&
1098                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1099                         goto retry;
1100
1101                 if (IS_ERR(b))
1102                         goto out;
1103         } else {
1104 lock_node:
1105                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1106                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1107                         return ERR_PTR(ret);
1108
1109                 BUG_ON(ret);
1110
1111                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1112                              b->c.btree_id != btree_id ||
1113                              b->c.level != level)) {
1114                         six_unlock_read(&b->c.lock);
1115                         goto retry;
1116                 }
1117         }
1118
1119         /* XXX: waiting on IO with btree locks held: */
1120         __bch2_btree_node_wait_on_read(b);
1121
1122         prefetch(b->aux_data);
1123
1124         for_each_bset(b, t) {
1125                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1126
1127                 prefetch(p + L1_CACHE_BYTES * 0);
1128                 prefetch(p + L1_CACHE_BYTES * 1);
1129                 prefetch(p + L1_CACHE_BYTES * 2);
1130         }
1131
1132         /* avoid atomic set bit if it's not needed: */
1133         if (!btree_node_accessed(b))
1134                 set_btree_node_accessed(b);
1135
1136         if (unlikely(btree_node_read_error(b))) {
1137                 six_unlock_read(&b->c.lock);
1138                 b = ERR_PTR(-EIO);
1139                 goto out;
1140         }
1141
1142         EBUG_ON(b->c.btree_id != btree_id);
1143         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1144         btree_check_header(c, b);
1145 out:
1146         bch2_btree_cache_cannibalize_unlock(c);
1147         return b;
1148 }
1149
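/*
 * Start an asynchronous read of the node @k points to, if it isn't already in
 * the cache; as with a regular fill, the caller must hold the parent node
 * locked.
 */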
1150 int bch2_btree_node_prefetch(struct btree_trans *trans,
1151                              struct btree_path *path,
1152                              const struct bkey_i *k,
1153                              enum btree_id btree_id, unsigned level)
1154 {
1155         struct bch_fs *c = trans->c;
1156         struct btree_cache *bc = &c->btree_cache;
1157         struct btree *b;
1158
1159         BUG_ON(trans && !btree_node_locked(path, level + 1));
1160         BUG_ON(level >= BTREE_MAX_DEPTH);
1161
1162         b = btree_cache_find(bc, k);
1163         if (b)
1164                 return 0;
1165
1166         b = bch2_btree_node_fill(trans, path, k, btree_id,
1167                                  level, SIX_LOCK_read, false);
1168         return PTR_ERR_OR_ZERO(b);
1169 }
1170
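/*
 * Drop the node @k points to from the cache entirely: wait for in-flight IO,
 * write the node out if it's still dirty, then free its memory and remove it
 * from the hash table.
 */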
1171 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1172 {
1173         struct bch_fs *c = trans->c;
1174         struct btree_cache *bc = &c->btree_cache;
1175         struct btree *b;
1176
1177         b = btree_cache_find(bc, k);
1178         if (!b)
1179                 return;
1180 wait_on_io:
1181         /* not allowed to wait on io with btree locks held: */
1182
1183         /* XXX we're called from btree_gc which will be holding other btree
1184          * nodes locked
1185          */
1186         __bch2_btree_node_wait_on_read(b);
1187         __bch2_btree_node_wait_on_write(b);
1188
1189         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1190         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1191
1192         if (btree_node_dirty(b)) {
1193                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1194                 six_unlock_write(&b->c.lock);
1195                 six_unlock_intent(&b->c.lock);
1196                 goto wait_on_io;
1197         }
1198
1199         BUG_ON(btree_node_dirty(b));
1200
1201         mutex_lock(&bc->lock);
1202         btree_node_data_free(c, b);
1203         bch2_btree_node_hash_remove(bc, b);
1204         mutex_unlock(&bc->lock);
1205
1206         six_unlock_write(&b->c.lock);
1207         six_unlock_intent(&b->c.lock);
1208 }
1209
1210 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1211                              const struct btree *b)
1212 {
1213         const struct bkey_format *f = &b->format;
1214         struct bset_stats stats;
1215
1216         memset(&stats, 0, sizeof(stats));
1217
1218         bch2_btree_keys_stats(b, &stats);
1219
1220         prt_printf(out, "l %u ", b->c.level);
1221         bch2_bpos_to_text(out, b->data->min_key);
1222         prt_printf(out, " - ");
1223         bch2_bpos_to_text(out, b->data->max_key);
1224         prt_printf(out, ":\n"
1225                "    ptrs: ");
1226         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1227
1228         prt_printf(out, "\n"
1229                "    format: u64s %u fields %u %u %u %u %u\n"
1230                "    unpack fn len: %u\n"
1231                "    bytes used %zu/%zu (%zu%% full)\n"
1232                "    sib u64s: %u, %u (merge threshold %u)\n"
1233                "    nr packed keys %u\n"
1234                "    nr unpacked keys %u\n"
1235                "    floats %zu\n"
1236                "    failed unpacked %zu\n",
1237                f->key_u64s,
1238                f->bits_per_field[0],
1239                f->bits_per_field[1],
1240                f->bits_per_field[2],
1241                f->bits_per_field[3],
1242                f->bits_per_field[4],
1243                b->unpack_fn_len,
1244                b->nr.live_u64s * sizeof(u64),
1245                btree_bytes(c) - sizeof(struct btree_node),
1246                b->nr.live_u64s * 100 / btree_max_u64s(c),
1247                b->sib_u64s[0],
1248                b->sib_u64s[1],
1249                c->btree_foreground_merge_threshold,
1250                b->nr.packed_keys,
1251                b->nr.unpacked_keys,
1252                stats.floats,
1253                stats.failed);
1254 }
1255
1256 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1257 {
1258         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1259         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1260         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1261
1262         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1263         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1264         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1265         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1266         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1267         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1268         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1269         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1270         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1271         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1272
1273 }