[bcachefs-tools-debian] / libbcachefs / btree_cache.c @ 661b76674a18540440d26d6730992a5b7b1d7ee5
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12 #include "trace.h"
13
14 #include <linux/prefetch.h>
15 #include <linux/sched/mm.h>
16 #include <linux/seq_buf.h>
17
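/*
 * Note: this macro relies on 'bc' and 'shrinker_counter' being in scope at the
 * call site (see __btree_node_reclaim() below), and only bumps the matching
 * not_freed_* counter when reclaim was initiated by the shrinker.
 */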
18 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
19 do {                                             \
20         if (shrinker_counter)                    \
21                 bc->not_freed_##counter++;       \
22 } while (0)
23
24 const char * const bch2_btree_node_flags[] = {
25 #define x(f)    #f,
26         BTREE_FLAGS()
27 #undef x
28         NULL
29 };
30
31 void bch2_recalc_btree_reserve(struct bch_fs *c)
32 {
33         unsigned i, reserve = 16;
34
35         if (!c->btree_roots[0].b)
36                 reserve += 8;
37
38         for (i = 0; i < BTREE_ID_NR; i++)
39                 if (c->btree_roots[i].b)
40                         reserve += min_t(unsigned, 1,
41                                          c->btree_roots[i].b->c.level) * 8;
42
43         c->btree_cache.reserve = reserve;
44 }
45
46 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
47 {
48         return max_t(int, 0, bc->used - bc->reserve);
49 }
50
51 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
52 {
53         if (b->c.lock.readers)
54                 list_move(&b->list, &bc->freed_pcpu);
55         else
56                 list_move(&b->list, &bc->freed_nonpcpu);
57 }
58
59 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
60 {
61         struct btree_cache *bc = &c->btree_cache;
62
63         EBUG_ON(btree_node_write_in_flight(b));
64
65         clear_btree_node_just_written(b);
66
67         kvpfree(b->data, btree_bytes(c));
68         b->data = NULL;
69 #ifdef __KERNEL__
70         kvfree(b->aux_data);
71 #else
72         munmap(b->aux_data, btree_aux_data_bytes(b));
73 #endif
74         b->aux_data = NULL;
75
76         bc->used--;
77
78         btree_node_to_freedlist(bc, b);
79 }
80
81 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
82                                    const void *obj)
83 {
84         const struct btree *b = obj;
85         const u64 *v = arg->key;
86
87         return b->hash_val == *v ? 0 : 1;
88 }
89
90 static const struct rhashtable_params bch_btree_cache_params = {
91         .head_offset    = offsetof(struct btree, hash),
92         .key_offset     = offsetof(struct btree, hash_val),
93         .key_len        = sizeof(u64),
94         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
95 };
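/*
 * Cached nodes live in bc->table keyed by b->hash_val, which
 * __bch2_btree_node_hash_insert() derives from the node's key via
 * btree_ptr_hash_val(); bch2_btree_node_hash_remove() zeroes hash_val so that
 * subsequent lookups miss.
 */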
96
97 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
98 {
99         BUG_ON(b->data || b->aux_data);
100
101         b->data = kvpmalloc(btree_bytes(c), gfp);
102         if (!b->data)
103                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
104 #ifdef __KERNEL__
105         b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
106 #else
107         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
108                            PROT_READ|PROT_WRITE|PROT_EXEC,
109                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
110         if (b->aux_data == MAP_FAILED)
111                 b->aux_data = NULL;
112 #endif
113         if (!b->aux_data) {
114                 kvpfree(b->data, btree_bytes(c));
115                 b->data = NULL;
116                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
117         }
118
119         return 0;
120 }
121
122 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
123 {
124         struct btree *b;
125
126         b = kzalloc(sizeof(struct btree), gfp);
127         if (!b)
128                 return NULL;
129
130         bkey_btree_ptr_init(&b->key);
131         INIT_LIST_HEAD(&b->list);
132         INIT_LIST_HEAD(&b->write_blocked);
133         b->byte_order = ilog2(btree_bytes(c));
134         return b;
135 }
136
137 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
138 {
139         struct btree_cache *bc = &c->btree_cache;
140         struct btree *b;
141
142         b = __btree_node_mem_alloc(c, GFP_KERNEL);
143         if (!b)
144                 return NULL;
145
146         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
147                 kfree(b);
148                 return NULL;
149         }
150
151         bch2_btree_lock_init(&b->c, 0);
152
153         bc->used++;
154         list_add(&b->list, &bc->freeable);
155         return b;
156 }
157
158 /* Btree in memory cache - hash table */
159
160 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
161 {
162         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
163
164         BUG_ON(ret);
165
166         /* Cause future lookups for this node to fail: */
167         b->hash_val = 0;
168 }
169
170 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
171 {
172         BUG_ON(b->hash_val);
173         b->hash_val = btree_ptr_hash_val(&b->key);
174
175         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
176                                              bch_btree_cache_params);
177 }
178
179 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
180                                 unsigned level, enum btree_id id)
181 {
182         int ret;
183
184         b->c.level      = level;
185         b->c.btree_id   = id;
186
187         mutex_lock(&bc->lock);
188         ret = __bch2_btree_node_hash_insert(bc, b);
189         if (!ret)
190                 list_add_tail(&b->list, &bc->live);
191         mutex_unlock(&bc->lock);
192
193         return ret;
194 }
195
196 __flatten
197 static inline struct btree *btree_cache_find(struct btree_cache *bc,
198                                      const struct bkey_i *k)
199 {
200         u64 v = btree_ptr_hash_val(k);
201
202         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
203 }
204
205 /*
206  * this version is for btree nodes that have already been freed (we're not
207  * reaping a real btree node)
208  */
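/*
 * On success (returning 0) the node's intent and write locks are left held;
 * the caller drops them after freeing or repurposing the node.
 */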
209 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
210 {
211         struct btree_cache *bc = &c->btree_cache;
212         int ret = 0;
213
214         lockdep_assert_held(&bc->lock);
215 wait_on_io:
216         if (b->flags & ((1U << BTREE_NODE_dirty)|
217                         (1U << BTREE_NODE_read_in_flight)|
218                         (1U << BTREE_NODE_write_in_flight))) {
219                 if (!flush) {
220                         if (btree_node_dirty(b))
221                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
222                         else if (btree_node_read_in_flight(b))
223                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
224                         else if (btree_node_write_in_flight(b))
225                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
226                         return -BCH_ERR_ENOMEM_btree_node_reclaim;
227                 }
228
229                 /* XXX: waiting on IO with btree cache lock held */
230                 bch2_btree_node_wait_on_read(b);
231                 bch2_btree_node_wait_on_write(b);
232         }
233
234         if (!six_trylock_intent(&b->c.lock)) {
235                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
236                 return -BCH_ERR_ENOMEM_btree_node_reclaim;
237         }
238
239         if (!six_trylock_write(&b->c.lock)) {
240                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
241                 goto out_unlock_intent;
242         }
243
244         /* recheck under lock */
245         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
246                         (1U << BTREE_NODE_write_in_flight))) {
247                 if (!flush) {
248                         if (btree_node_read_in_flight(b))
249                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
250                         else if (btree_node_write_in_flight(b))
251                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
252                         goto out_unlock;
253                 }
254                 six_unlock_write(&b->c.lock);
255                 six_unlock_intent(&b->c.lock);
256                 goto wait_on_io;
257         }
258
259         if (btree_node_noevict(b)) {
260                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
261                 goto out_unlock;
262         }
263         if (btree_node_write_blocked(b)) {
264                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
265                 goto out_unlock;
266         }
267         if (btree_node_will_make_reachable(b)) {
268                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
269                 goto out_unlock;
270         }
271
272         if (btree_node_dirty(b)) {
273                 if (!flush) {
274                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
275                         goto out_unlock;
276                 }
277                 /*
278                  * Using the underscore version because we don't want to compact
279                  * bsets after the write, since this node is about to be evicted
280                  * - unless btree verify mode is enabled, since it runs out of
281                  * the post write cleanup:
282                  */
283                 if (bch2_verify_btree_ondisk)
284                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
285                                               BTREE_WRITE_cache_reclaim);
286                 else
287                         __bch2_btree_node_write(c, b,
288                                                 BTREE_WRITE_cache_reclaim);
289
290                 six_unlock_write(&b->c.lock);
291                 six_unlock_intent(&b->c.lock);
292                 goto wait_on_io;
293         }
294 out:
295         if (b->hash_val && !ret)
296                 trace_and_count(c, btree_cache_reap, c, b);
297         return ret;
298 out_unlock:
299         six_unlock_write(&b->c.lock);
300 out_unlock_intent:
301         six_unlock_intent(&b->c.lock);
302         ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
303         goto out;
304 }
305
306 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
307 {
308         return __btree_node_reclaim(c, b, false, shrinker_counter);
309 }
310
311 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
312 {
313         return __btree_node_reclaim(c, b, true, false);
314 }
315
316 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
317                                            struct shrink_control *sc)
318 {
319         struct bch_fs *c = container_of(shrink, struct bch_fs,
320                                         btree_cache.shrink);
321         struct btree_cache *bc = &c->btree_cache;
322         struct btree *b, *t;
323         unsigned long nr = sc->nr_to_scan;
324         unsigned long can_free = 0;
325         unsigned long freed = 0;
326         unsigned long touched = 0;
327         unsigned i, flags;
328         unsigned long ret = SHRINK_STOP;
329         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
330                 bc->used * 3 / 4;
331
332         if (bch2_btree_shrinker_disabled)
333                 return SHRINK_STOP;
334
335         mutex_lock(&bc->lock);
336         flags = memalloc_nofs_save();
337
338         /*
339          * It's _really_ critical that we don't free too many btree nodes - we
340          * have to always leave ourselves a reserve. The reserve is how we
341          * guarantee that allocating memory for a new btree node can always
342          * succeed, so that inserting keys into the btree can always succeed and
343          * IO can always make forward progress:
344          */
345         can_free = btree_cache_can_free(bc);
346         nr = min_t(unsigned long, nr, can_free);
347
348         i = 0;
349         list_for_each_entry_safe(b, t, &bc->freeable, list) {
350                 /*
351                  * Leave a few nodes on the freeable list, so that a btree split
352                  * won't have to hit the system allocator:
353                  */
354                 if (++i <= 3)
355                         continue;
356
357                 touched++;
358
359                 if (touched >= nr)
360                         goto out;
361
362                 if (!btree_node_reclaim(c, b, true)) {
363                         btree_node_data_free(c, b);
364                         six_unlock_write(&b->c.lock);
365                         six_unlock_intent(&b->c.lock);
366                         freed++;
367                         bc->freed++;
368                 }
369         }
370 restart:
371         list_for_each_entry_safe(b, t, &bc->live, list) {
372                 touched++;
373
374                 if (btree_node_accessed(b)) {
375                         clear_btree_node_accessed(b);
376                         bc->not_freed_access_bit++;
377                 } else if (!btree_node_reclaim(c, b, true)) {
378                         freed++;
379                         btree_node_data_free(c, b);
380                         bc->freed++;
381
382                         bch2_btree_node_hash_remove(bc, b);
383                         six_unlock_write(&b->c.lock);
384                         six_unlock_intent(&b->c.lock);
385
386                         if (freed == nr)
387                                 goto out_rotate;
388                 } else if (trigger_writes &&
389                            btree_node_dirty(b) &&
390                            !btree_node_will_make_reachable(b) &&
391                            !btree_node_write_blocked(b) &&
392                            six_trylock_read(&b->c.lock)) {
393                         list_move(&bc->live, &b->list);
394                         mutex_unlock(&bc->lock);
395                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
396                         six_unlock_read(&b->c.lock);
397                         if (touched >= nr)
398                                 goto out_nounlock;
399                         mutex_lock(&bc->lock);
400                         goto restart;
401                 }
402
403                 if (touched >= nr)
404                         break;
405         }
406 out_rotate:
407         if (&t->list != &bc->live)
408                 list_move_tail(&bc->live, &t->list);
409 out:
410         mutex_unlock(&bc->lock);
411 out_nounlock:
412         ret = freed;
413         memalloc_nofs_restore(flags);
414         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
415         return ret;
416 }
417
418 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
419                                             struct shrink_control *sc)
420 {
421         struct bch_fs *c = container_of(shrink, struct bch_fs,
422                                         btree_cache.shrink);
423         struct btree_cache *bc = &c->btree_cache;
424
425         if (bch2_btree_shrinker_disabled)
426                 return 0;
427
428         return btree_cache_can_free(bc);
429 }
430
431 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
432 {
433         struct bch_fs *c = container_of(shrink, struct bch_fs,
434                                         btree_cache.shrink);
435         char *cbuf;
436         size_t buflen = seq_buf_get_buf(s, &cbuf);
437         struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
438
439         bch2_btree_cache_to_text(&out, &c->btree_cache);
440         seq_buf_commit(s, out.pos);
441 }
442
443 void bch2_fs_btree_cache_exit(struct bch_fs *c)
444 {
445         struct btree_cache *bc = &c->btree_cache;
446         struct btree *b;
447         unsigned i, flags;
448
449         if (bc->shrink.list.next)
450                 unregister_shrinker(&bc->shrink);
451
452         /* vfree() can allocate memory: */
453         flags = memalloc_nofs_save();
454         mutex_lock(&bc->lock);
455
456         if (c->verify_data)
457                 list_move(&c->verify_data->list, &bc->live);
458
459         kvpfree(c->verify_ondisk, btree_bytes(c));
460
461         for (i = 0; i < BTREE_ID_NR; i++)
462                 if (c->btree_roots[i].b)
463                         list_add(&c->btree_roots[i].b->list, &bc->live);
464
465         list_splice(&bc->freeable, &bc->live);
466
467         while (!list_empty(&bc->live)) {
468                 b = list_first_entry(&bc->live, struct btree, list);
469
470                 BUG_ON(btree_node_read_in_flight(b) ||
471                        btree_node_write_in_flight(b));
472
473                 if (btree_node_dirty(b))
474                         bch2_btree_complete_write(c, b, btree_current_write(b));
475                 clear_btree_node_dirty_acct(c, b);
476
477                 btree_node_data_free(c, b);
478         }
479
480         BUG_ON(atomic_read(&c->btree_cache.dirty));
481
482         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
483
484         while (!list_empty(&bc->freed_nonpcpu)) {
485                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
486                 list_del(&b->list);
487                 six_lock_exit(&b->c.lock);
488                 kfree(b);
489         }
490
491         mutex_unlock(&bc->lock);
492         memalloc_nofs_restore(flags);
493
494         if (bc->table_init_done)
495                 rhashtable_destroy(&bc->table);
496 }
497
498 int bch2_fs_btree_cache_init(struct bch_fs *c)
499 {
500         struct btree_cache *bc = &c->btree_cache;
501         unsigned i;
502         int ret = 0;
503
504         pr_verbose_init(c->opts, "");
505
506         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
507         if (ret)
508                 goto out;
509
510         bc->table_init_done = true;
511
512         bch2_recalc_btree_reserve(c);
513
514         for (i = 0; i < bc->reserve; i++)
515                 if (!__bch2_btree_node_mem_alloc(c)) {
516                         ret = -BCH_ERR_ENOMEM_fs_btree_cache_init;
517                         goto out;
518                 }
519
520         list_splice_init(&bc->live, &bc->freeable);
521
522         mutex_init(&c->verify_lock);
523
524         bc->shrink.count_objects        = bch2_btree_cache_count;
525         bc->shrink.scan_objects         = bch2_btree_cache_scan;
526         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
527         bc->shrink.seeks                = 4;
528         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
529 out:
530         pr_verbose_init(c->opts, "ret %i", ret);
531         return ret;
532 }
533
534 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
535 {
536         mutex_init(&bc->lock);
537         INIT_LIST_HEAD(&bc->live);
538         INIT_LIST_HEAD(&bc->freeable);
539         INIT_LIST_HEAD(&bc->freed_pcpu);
540         INIT_LIST_HEAD(&bc->freed_nonpcpu);
541 }
542
543 /*
544  * We can only have one thread cannibalizing other cached btree nodes at a time,
 545  * or we'll deadlock. We use an open coded mutex to ensure that, which
 546  * bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock
 547  * the root of the btree, we need to release this lock if we have it held.
548  */
549 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
550 {
551         struct btree_cache *bc = &c->btree_cache;
552
553         if (bc->alloc_lock == current) {
554                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
555                 bc->alloc_lock = NULL;
556                 closure_wake_up(&bc->alloc_wait);
557         }
558 }
559
560 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
561 {
562         struct btree_cache *bc = &c->btree_cache;
563         struct task_struct *old;
564
565         old = cmpxchg(&bc->alloc_lock, NULL, current);
566         if (old == NULL || old == current)
567                 goto success;
568
569         if (!cl) {
570                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
571                 return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
572         }
573
574         closure_wait(&bc->alloc_wait, cl);
575
576         /* Try again, after adding ourselves to waitlist */
577         old = cmpxchg(&bc->alloc_lock, NULL, current);
578         if (old == NULL || old == current) {
579                 /* We raced */
580                 closure_wake_up(&bc->alloc_wait);
581                 goto success;
582         }
583
584         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
585         return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
586
587 success:
588         trace_and_count(c, btree_cache_cannibalize_lock, c);
589         return 0;
590 }
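/*
 * Sketch of the expected caller pattern (the real callers live in the btree
 * update path, outside this file): take the cannibalize lock before allocating
 * new nodes, e.g.
 *
 *	ret = bch2_btree_cache_cannibalize_lock(c, cl);
 *	...
 *	b = bch2_btree_node_mem_alloc(trans, ...);
 *	...
 *	bch2_btree_cache_cannibalize_unlock(c);
 *
 * If the lock is contended and a closure was passed, the caller is added to
 * bc->alloc_wait and should retry once the closure completes.
 */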
591
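/*
 * Only called from bch2_btree_node_mem_alloc() below, with bc->lock held and
 * after the calling thread has taken the cannibalize lock
 * (bc->alloc_lock == current); the returned node comes back with its intent
 * and write locks held.
 */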
592 static struct btree *btree_node_cannibalize(struct bch_fs *c)
593 {
594         struct btree_cache *bc = &c->btree_cache;
595         struct btree *b;
596
597         list_for_each_entry_reverse(b, &bc->live, list)
598                 if (!btree_node_reclaim(c, b, false))
599                         return b;
600
601         while (1) {
602                 list_for_each_entry_reverse(b, &bc->live, list)
603                         if (!btree_node_write_and_reclaim(c, b))
604                                 return b;
605
606                 /*
607                  * Rare case: all nodes were intent-locked.
608                  * Just busy-wait.
609                  */
610                 WARN_ONCE(1, "btree cache cannibalize failed\n");
611                 cond_resched();
612         }
613 }
614
615 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
616 {
617         struct bch_fs *c = trans->c;
618         struct btree_cache *bc = &c->btree_cache;
619         struct list_head *freed = pcpu_read_locks
620                 ? &bc->freed_pcpu
621                 : &bc->freed_nonpcpu;
622         struct btree *b, *b2;
623         u64 start_time = local_clock();
624         unsigned flags;
625
626         flags = memalloc_nofs_save();
627         mutex_lock(&bc->lock);
628
629         /*
630          * We never free struct btree itself, just the memory that holds the on
631          * disk node. Check the freed list before allocating a new one:
632          */
633         list_for_each_entry(b, freed, list)
634                 if (!btree_node_reclaim(c, b, false)) {
635                         list_del_init(&b->list);
636                         goto got_node;
637                 }
638
639         b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
640         if (!b) {
641                 mutex_unlock(&bc->lock);
642                 bch2_trans_unlock(trans);
643                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
644                 if (!b)
645                         goto err;
646                 mutex_lock(&bc->lock);
647         }
648
649         bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
650
651         BUG_ON(!six_trylock_intent(&b->c.lock));
652         BUG_ON(!six_trylock_write(&b->c.lock));
653 got_node:
654
655         /*
656          * btree_free() doesn't free memory; it sticks the node on the end of
657          * the list. Check if there's any freed nodes there:
658          */
659         list_for_each_entry(b2, &bc->freeable, list)
660                 if (!btree_node_reclaim(c, b2, false)) {
661                         swap(b->data, b2->data);
662                         swap(b->aux_data, b2->aux_data);
663                         btree_node_to_freedlist(bc, b2);
664                         six_unlock_write(&b2->c.lock);
665                         six_unlock_intent(&b2->c.lock);
666                         goto got_mem;
667                 }
668
669         mutex_unlock(&bc->lock);
670
671         if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
672                 bch2_trans_unlock(trans);
673                 if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
674                         goto err;
675         }
676
677         mutex_lock(&bc->lock);
678         bc->used++;
679 got_mem:
680         mutex_unlock(&bc->lock);
681
682         BUG_ON(btree_node_hashed(b));
683         BUG_ON(btree_node_dirty(b));
684         BUG_ON(btree_node_write_in_flight(b));
685 out:
686         b->flags                = 0;
687         b->written              = 0;
688         b->nsets                = 0;
689         b->sib_u64s[0]          = 0;
690         b->sib_u64s[1]          = 0;
691         b->whiteout_u64s        = 0;
692         bch2_btree_keys_init(b);
693         set_btree_node_accessed(b);
694
695         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
696                                start_time);
697
698         memalloc_nofs_restore(flags);
699         return b;
700 err:
701         mutex_lock(&bc->lock);
702
703         /* Try to cannibalize another cached btree node: */
704         if (bc->alloc_lock == current) {
705                 b2 = btree_node_cannibalize(c);
706                 clear_btree_node_just_written(b2);
707                 bch2_btree_node_hash_remove(bc, b2);
708
709                 if (b) {
710                         swap(b->data, b2->data);
711                         swap(b->aux_data, b2->aux_data);
712                         btree_node_to_freedlist(bc, b2);
713                         six_unlock_write(&b2->c.lock);
714                         six_unlock_intent(&b2->c.lock);
715                 } else {
716                         b = b2;
717                         list_del_init(&b->list);
718                 }
719
720                 mutex_unlock(&bc->lock);
721
722                 trace_and_count(c, btree_cache_cannibalize, c);
723                 goto out;
724         }
725
726         mutex_unlock(&bc->lock);
727         memalloc_nofs_restore(flags);
728         return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
729 }
730
731 /* Slowpath, don't want it inlined into btree_iter_traverse() */
732 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
733                                 struct btree_path *path,
734                                 const struct bkey_i *k,
735                                 enum btree_id btree_id,
736                                 unsigned level,
737                                 enum six_lock_type lock_type,
738                                 bool sync)
739 {
740         struct bch_fs *c = trans->c;
741         struct btree_cache *bc = &c->btree_cache;
742         struct btree *b;
743         u32 seq;
744
745         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
746         /*
747          * Parent node must be locked, else we could read in a btree node that's
748          * been freed:
749          */
750         if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
751                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
752                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
753         }
754
755         b = bch2_btree_node_mem_alloc(trans, level != 0);
756
757         if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
758                 trans->memory_allocation_failure = true;
759                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
760                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
761         }
762
763         if (IS_ERR(b))
764                 return b;
765
766         /*
767          * Btree nodes read in from disk should not have the accessed bit set
768          * initially, so that linear scans don't thrash the cache:
769          */
770         clear_btree_node_accessed(b);
771
772         bkey_copy(&b->key, k);
773         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
774                 /* raced with another fill: */
775
776                 /* mark as unhashed... */
777                 b->hash_val = 0;
778
779                 mutex_lock(&bc->lock);
780                 list_add(&b->list, &bc->freeable);
781                 mutex_unlock(&bc->lock);
782
783                 six_unlock_write(&b->c.lock);
784                 six_unlock_intent(&b->c.lock);
785                 return NULL;
786         }
787
788         set_btree_node_read_in_flight(b);
789
790         six_unlock_write(&b->c.lock);
791         seq = six_lock_seq(&b->c.lock);
792         six_unlock_intent(&b->c.lock);
793
794         /* Unlock before doing IO: */
795         if (trans && sync)
796                 bch2_trans_unlock(trans);
797
798         bch2_btree_node_read(c, b, sync);
799
800         if (!sync)
801                 return NULL;
802
803         if (path) {
804                 int ret = bch2_trans_relock(trans) ?:
805                         bch2_btree_path_relock_intent(trans, path);
806                 if (ret) {
807                         BUG_ON(!trans->restarted);
808                         return ERR_PTR(ret);
809                 }
810         }
811
812         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
813                 if (path)
814                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
815                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
816         }
817
818         return b;
819 }
820
821 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
822 {
823         struct printbuf buf = PRINTBUF;
824
825         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
826                 return;
827
828         prt_printf(&buf,
829                "btree node header doesn't match ptr\n"
830                "btree %s level %u\n"
831                "ptr: ",
832                bch2_btree_ids[b->c.btree_id], b->c.level);
833         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
834
835         prt_printf(&buf, "\nheader: btree %s level %llu\n"
836                "min ",
837                bch2_btree_ids[BTREE_NODE_ID(b->data)],
838                BTREE_NODE_LEVEL(b->data));
839         bch2_bpos_to_text(&buf, b->data->min_key);
840
841         prt_printf(&buf, "\nmax ");
842         bch2_bpos_to_text(&buf, b->data->max_key);
843
844         bch2_fs_inconsistent(c, "%s", buf.buf);
845         printbuf_exit(&buf);
846 }
847
848 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
849 {
850         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
851             b->c.level != BTREE_NODE_LEVEL(b->data) ||
852             !bpos_eq(b->data->max_key, b->key.k.p) ||
853             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
854              !bpos_eq(b->data->min_key,
855                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
856                 btree_bad_header(c, b);
857 }
858
859 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
860                                            const struct bkey_i *k, unsigned level,
861                                            enum six_lock_type lock_type,
862                                            unsigned long trace_ip)
863 {
864         struct bch_fs *c = trans->c;
865         struct btree_cache *bc = &c->btree_cache;
866         struct btree *b;
867         struct bset_tree *t;
868         bool need_relock = false;
869         int ret;
870
871         EBUG_ON(level >= BTREE_MAX_DEPTH);
872 retry:
873         b = btree_cache_find(bc, k);
874         if (unlikely(!b)) {
875                 /*
876                  * We must have the parent locked to call bch2_btree_node_fill(),
877                  * else we could read in a btree node from disk that's been
878                  * freed:
879                  */
880                 b = bch2_btree_node_fill(trans, path, k, path->btree_id,
881                                          level, lock_type, true);
882                 need_relock = true;
883
884                 /* We raced and found the btree node in the cache */
885                 if (!b)
886                         goto retry;
887
888                 if (IS_ERR(b))
889                         return b;
890         } else {
891                 if (btree_node_read_locked(path, level + 1))
892                         btree_node_unlock(trans, path, level + 1);
893
894                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
895                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
896                         return ERR_PTR(ret);
897
898                 BUG_ON(ret);
899
900                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
901                              b->c.level != level ||
902                              race_fault())) {
903                         six_unlock_type(&b->c.lock, lock_type);
904                         if (bch2_btree_node_relock(trans, path, level + 1))
905                                 goto retry;
906
907                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
908                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
909                 }
910
911                 /* avoid atomic set bit if it's not needed: */
912                 if (!btree_node_accessed(b))
913                         set_btree_node_accessed(b);
914         }
915
916         if (unlikely(btree_node_read_in_flight(b))) {
917                 u32 seq = six_lock_seq(&b->c.lock);
918
919                 six_unlock_type(&b->c.lock, lock_type);
920                 bch2_trans_unlock(trans);
921                 need_relock = true;
922
923                 bch2_btree_node_wait_on_read(b);
924
925                 /*
926                  * should_be_locked is not set on this path yet, so we need to
927                  * relock it specifically:
928                  */
929                 if (!six_relock_type(&b->c.lock, lock_type, seq))
930                         goto retry;
931         }
932
933         if (unlikely(need_relock)) {
934                 int ret = bch2_trans_relock(trans) ?:
935                         bch2_btree_path_relock_intent(trans, path);
936                 if (ret) {
937                         six_unlock_type(&b->c.lock, lock_type);
938                         return ERR_PTR(ret);
939                 }
940         }
941
942         prefetch(b->aux_data);
943
944         for_each_bset(b, t) {
945                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
946
947                 prefetch(p + L1_CACHE_BYTES * 0);
948                 prefetch(p + L1_CACHE_BYTES * 1);
949                 prefetch(p + L1_CACHE_BYTES * 2);
950         }
951
952         if (unlikely(btree_node_read_error(b))) {
953                 six_unlock_type(&b->c.lock, lock_type);
954                 return ERR_PTR(-EIO);
955         }
956
957         EBUG_ON(b->c.btree_id != path->btree_id);
958         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
959         btree_check_header(c, b);
960
961         return b;
962 }
963
964 /**
 965  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
 966  * in from disk if necessary.
 967  *
 968  * The btree node will have either a read or an intent lock held, depending on
 969  * the @lock_type parameter.
970  */
971 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
972                                   const struct bkey_i *k, unsigned level,
973                                   enum six_lock_type lock_type,
974                                   unsigned long trace_ip)
975 {
976         struct bch_fs *c = trans->c;
977         struct btree *b;
978         struct bset_tree *t;
979         int ret;
980
981         EBUG_ON(level >= BTREE_MAX_DEPTH);
982
983         b = btree_node_mem_ptr(k);
984
985         /*
986          * Check b->hash_val _before_ calling btree_node_lock() - this might not
987          * be the node we want anymore, and trying to lock the wrong node could
 988          * cause an unnecessary transaction restart:
989          */
990         if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
991                      !b ||
992                      b->hash_val != btree_ptr_hash_val(k)))
993                 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
994
995         if (btree_node_read_locked(path, level + 1))
996                 btree_node_unlock(trans, path, level + 1);
997
998         ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
999         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1000                 return ERR_PTR(ret);
1001
1002         BUG_ON(ret);
1003
1004         if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1005                      b->c.level != level ||
1006                      race_fault())) {
1007                 six_unlock_type(&b->c.lock, lock_type);
1008                 if (bch2_btree_node_relock(trans, path, level + 1))
1009                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1010
1011                 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1012                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1013         }
1014
1015         if (unlikely(btree_node_read_in_flight(b))) {
1016                 u32 seq = six_lock_seq(&b->c.lock);
1017
1018                 six_unlock_type(&b->c.lock, lock_type);
1019                 bch2_trans_unlock(trans);
1020
1021                 bch2_btree_node_wait_on_read(b);
1022
1023                 /*
1024                  * should_be_locked is not set on this path yet, so we need to
1025                  * relock it specifically:
1026                  */
1027                 if (trans) {
1028                         int ret = bch2_trans_relock(trans) ?:
1029                                 bch2_btree_path_relock_intent(trans, path);
1030                         if (ret) {
1031                                 BUG_ON(!trans->restarted);
1032                                 return ERR_PTR(ret);
1033                         }
1034                 }
1035
1036                 if (!six_relock_type(&b->c.lock, lock_type, seq))
1037                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1038         }
1039
1040         prefetch(b->aux_data);
1041
1042         for_each_bset(b, t) {
1043                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1044
1045                 prefetch(p + L1_CACHE_BYTES * 0);
1046                 prefetch(p + L1_CACHE_BYTES * 1);
1047                 prefetch(p + L1_CACHE_BYTES * 2);
1048         }
1049
1050         /* avoid atomic set bit if it's not needed: */
1051         if (!btree_node_accessed(b))
1052                 set_btree_node_accessed(b);
1053
1054         if (unlikely(btree_node_read_error(b))) {
1055                 six_unlock_type(&b->c.lock, lock_type);
1056                 return ERR_PTR(-EIO);
1057         }
1058
1059         EBUG_ON(b->c.btree_id != path->btree_id);
1060         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1061         btree_check_header(c, b);
1062
1063         return b;
1064 }
1065
1066 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1067                                          const struct bkey_i *k,
1068                                          enum btree_id btree_id,
1069                                          unsigned level,
1070                                          bool nofill)
1071 {
1072         struct bch_fs *c = trans->c;
1073         struct btree_cache *bc = &c->btree_cache;
1074         struct btree *b;
1075         struct bset_tree *t;
1076         int ret;
1077
1078         EBUG_ON(level >= BTREE_MAX_DEPTH);
1079
1080         if (c->opts.btree_node_mem_ptr_optimization) {
1081                 b = btree_node_mem_ptr(k);
1082                 if (b)
1083                         goto lock_node;
1084         }
1085 retry:
1086         b = btree_cache_find(bc, k);
1087         if (unlikely(!b)) {
1088                 if (nofill)
1089                         goto out;
1090
1091                 b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1092                                          level, SIX_LOCK_read, true);
1093
1094                 /* We raced and found the btree node in the cache */
1095                 if (!b)
1096                         goto retry;
1097
1098                 if (IS_ERR(b) &&
1099                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1100                         goto retry;
1101
1102                 if (IS_ERR(b))
1103                         goto out;
1104         } else {
1105 lock_node:
1106                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1107                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1108                         return ERR_PTR(ret);
1109
1110                 BUG_ON(ret);
1111
1112                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1113                              b->c.btree_id != btree_id ||
1114                              b->c.level != level)) {
1115                         six_unlock_read(&b->c.lock);
1116                         goto retry;
1117                 }
1118         }
1119
1120         /* XXX: waiting on IO with btree locks held: */
1121         __bch2_btree_node_wait_on_read(b);
1122
1123         prefetch(b->aux_data);
1124
1125         for_each_bset(b, t) {
1126                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1127
1128                 prefetch(p + L1_CACHE_BYTES * 0);
1129                 prefetch(p + L1_CACHE_BYTES * 1);
1130                 prefetch(p + L1_CACHE_BYTES * 2);
1131         }
1132
1133         /* avoid atomic set bit if it's not needed: */
1134         if (!btree_node_accessed(b))
1135                 set_btree_node_accessed(b);
1136
1137         if (unlikely(btree_node_read_error(b))) {
1138                 six_unlock_read(&b->c.lock);
1139                 b = ERR_PTR(-EIO);
1140                 goto out;
1141         }
1142
1143         EBUG_ON(b->c.btree_id != btree_id);
1144         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1145         btree_check_header(c, b);
1146 out:
1147         bch2_btree_cache_cannibalize_unlock(c);
1148         return b;
1149 }
1150
1151 int bch2_btree_node_prefetch(struct btree_trans *trans,
1152                              struct btree_path *path,
1153                              const struct bkey_i *k,
1154                              enum btree_id btree_id, unsigned level)
1155 {
1156         struct bch_fs *c = trans->c;
1157         struct btree_cache *bc = &c->btree_cache;
1158         struct btree *b;
1159
1160         BUG_ON(trans && !btree_node_locked(path, level + 1));
1161         BUG_ON(level >= BTREE_MAX_DEPTH);
1162
1163         b = btree_cache_find(bc, k);
1164         if (b)
1165                 return 0;
1166
1167         b = bch2_btree_node_fill(trans, path, k, btree_id,
1168                                  level, SIX_LOCK_read, false);
1169         return PTR_ERR_OR_ZERO(b);
1170 }
1171
1172 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1173 {
1174         struct bch_fs *c = trans->c;
1175         struct btree_cache *bc = &c->btree_cache;
1176         struct btree *b;
1177
1178         b = btree_cache_find(bc, k);
1179         if (!b)
1180                 return;
1181 wait_on_io:
1182         /* not allowed to wait on io with btree locks held: */
1183
1184         /* XXX we're called from btree_gc which will be holding other btree
1185          * nodes locked
1186          */
1187         __bch2_btree_node_wait_on_read(b);
1188         __bch2_btree_node_wait_on_write(b);
1189
1190         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1191         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1192
1193         if (btree_node_dirty(b)) {
1194                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1195                 six_unlock_write(&b->c.lock);
1196                 six_unlock_intent(&b->c.lock);
1197                 goto wait_on_io;
1198         }
1199
1200         BUG_ON(btree_node_dirty(b));
1201
1202         mutex_lock(&bc->lock);
1203         btree_node_data_free(c, b);
1204         bch2_btree_node_hash_remove(bc, b);
1205         mutex_unlock(&bc->lock);
1206
1207         six_unlock_write(&b->c.lock);
1208         six_unlock_intent(&b->c.lock);
1209 }
1210
1211 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1212                              const struct btree *b)
1213 {
1214         const struct bkey_format *f = &b->format;
1215         struct bset_stats stats;
1216
1217         memset(&stats, 0, sizeof(stats));
1218
1219         bch2_btree_keys_stats(b, &stats);
1220
1221         prt_printf(out, "l %u ", b->c.level);
1222         bch2_bpos_to_text(out, b->data->min_key);
1223         prt_printf(out, " - ");
1224         bch2_bpos_to_text(out, b->data->max_key);
1225         prt_printf(out, ":\n"
1226                "    ptrs: ");
1227         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1228
1229         prt_printf(out, "\n"
1230                "    format: u64s %u fields %u %u %u %u %u\n"
1231                "    unpack fn len: %u\n"
1232                "    bytes used %zu/%zu (%zu%% full)\n"
1233                "    sib u64s: %u, %u (merge threshold %u)\n"
1234                "    nr packed keys %u\n"
1235                "    nr unpacked keys %u\n"
1236                "    floats %zu\n"
1237                "    failed unpacked %zu\n",
1238                f->key_u64s,
1239                f->bits_per_field[0],
1240                f->bits_per_field[1],
1241                f->bits_per_field[2],
1242                f->bits_per_field[3],
1243                f->bits_per_field[4],
1244                b->unpack_fn_len,
1245                b->nr.live_u64s * sizeof(u64),
1246                btree_bytes(c) - sizeof(struct btree_node),
1247                b->nr.live_u64s * 100 / btree_max_u64s(c),
1248                b->sib_u64s[0],
1249                b->sib_u64s[1],
1250                c->btree_foreground_merge_threshold,
1251                b->nr.packed_keys,
1252                b->nr.unpacked_keys,
1253                stats.floats,
1254                stats.failed);
1255 }
1256
1257 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1258 {
1259         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1260         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1261         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1262
1263         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1264         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1265         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1266         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1267         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1268         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1269         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1270         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1271         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1272         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1273
1274 }