bcachefs-tools-debian / libbcachefs / btree_cache.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "debug.h"
10 #include "errcode.h"
11 #include "error.h"
12 #include "trace.h"
13
14 #include <linux/prefetch.h>
15 #include <linux/sched/mm.h>
16 #include <linux/seq_buf.h>
17
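/*
 * Bump the matching bc->not_freed_* counter to record why a node couldn't be
 * freed; only counts when called on behalf of the shrinker
 * (shrinker_counter == true).
 */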
18 #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
19 do {                                             \
20         if (shrinker_counter)                    \
21                 bc->not_freed_##counter++;       \
22 } while (0)
23
24 const char * const bch2_btree_node_flags[] = {
25 #define x(f)    #f,
26         BTREE_FLAGS()
27 #undef x
28         NULL
29 };
30
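/*
 * Recompute how many cached btree nodes to keep in reserve: a fixed floor,
 * plus 8 for each btree whose root is an interior node. The reserve is what
 * lets allocating a new node (e.g. for a split) always succeed.
 */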
31 void bch2_recalc_btree_reserve(struct bch_fs *c)
32 {
33         unsigned i, reserve = 16;
34
35         if (!c->btree_roots[0].b)
36                 reserve += 8;
37
38         for (i = 0; i < BTREE_ID_NR; i++)
39                 if (c->btree_roots[i].b)
40                         reserve += min_t(unsigned, 1,
41                                          c->btree_roots[i].b->c.level) * 8;
42
43         c->btree_cache.reserve = reserve;
44 }
45
46 static inline unsigned btree_cache_can_free(struct btree_cache *bc)
47 {
48         return max_t(int, 0, bc->used - bc->reserve);
49 }
50
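/*
 * Freed nodes are kept on two separate lists, depending on whether their lock
 * was initialized with per-cpu reader counts, so a reused struct btree always
 * comes with the matching kind of six lock.
 */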
51 static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
52 {
53         if (b->c.lock.readers)
54                 list_move(&b->list, &bc->freed_pcpu);
55         else
56                 list_move(&b->list, &bc->freed_nonpcpu);
57 }
58
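/*
 * Free a node's data buffers (b->data and the aux search tree arrays) but keep
 * the struct btree itself around on one of the freed lists for reuse.
 */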
59 static void btree_node_data_free(struct bch_fs *c, struct btree *b)
60 {
61         struct btree_cache *bc = &c->btree_cache;
62
63         EBUG_ON(btree_node_write_in_flight(b));
64
65         clear_btree_node_just_written(b);
66
67         kvpfree(b->data, btree_bytes(c));
68         b->data = NULL;
69 #ifdef __KERNEL__
70         kvfree(b->aux_data);
71 #else
72         munmap(b->aux_data, btree_aux_data_bytes(b));
73 #endif
74         b->aux_data = NULL;
75
76         bc->used--;
77
78         btree_node_to_freedlist(bc, b);
79 }
80
81 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
82                                    const void *obj)
83 {
84         const struct btree *b = obj;
85         const u64 *v = arg->key;
86
87         return b->hash_val == *v ? 0 : 1;
88 }
89
90 static const struct rhashtable_params bch_btree_cache_params = {
91         .head_offset    = offsetof(struct btree, hash),
92         .key_offset     = offsetof(struct btree, hash_val),
93         .key_len        = sizeof(u64),
94         .obj_cmpfn      = bch2_btree_cache_cmp_fn,
95 };
96
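/*
 * Allocate the buffers backing a node: b->data holds the node in its on-disk
 * format, b->aux_data the auxiliary search tree data. In userspace
 * (bcachefs-tools) aux_data is mmapped instead of kvmalloc'd.
 */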
97 static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
98 {
99         BUG_ON(b->data || b->aux_data);
100
101         b->data = kvpmalloc(btree_bytes(c), gfp);
102         if (!b->data)
103                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
104 #ifdef __KERNEL__
105         b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
106 #else
107         b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
108                            PROT_READ|PROT_WRITE|PROT_EXEC,
109                            MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
110         if (b->aux_data == MAP_FAILED)
111                 b->aux_data = NULL;
112 #endif
113         if (!b->aux_data) {
114                 kvpfree(b->data, btree_bytes(c));
115                 b->data = NULL;
116                 return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
117         }
118
119         return 0;
120 }
121
122 static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
123 {
124         struct btree *b;
125
126         b = kzalloc(sizeof(struct btree), gfp);
127         if (!b)
128                 return NULL;
129
130         bkey_btree_ptr_init(&b->key);
131 #ifdef CONFIG_DEBUG_LOCK_ALLOC
132         lockdep_set_no_check_recursion(&b->c.lock.dep_map);
133 #endif
134         INIT_LIST_HEAD(&b->list);
135         INIT_LIST_HEAD(&b->write_blocked);
136         b->byte_order = ilog2(btree_bytes(c));
137         return b;
138 }
139
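/*
 * Allocate a fully backed struct btree (lock initialized, data buffers
 * allocated) and put it on the freeable list.
 */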
140 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
141 {
142         struct btree_cache *bc = &c->btree_cache;
143         struct btree *b;
144
145         b = __btree_node_mem_alloc(c, GFP_KERNEL);
146         if (!b)
147                 return NULL;
148
149         if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
150                 kfree(b);
151                 return NULL;
152         }
153
154         bch2_btree_lock_init(&b->c, 0);
155
156         bc->used++;
157         list_add(&b->list, &bc->freeable);
158         return b;
159 }
160
161 /* Btree in memory cache - hash table */
162
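/*
 * Nodes are indexed by b->hash_val, the hash of the node's pointer key
 * (btree_ptr_hash_val()), so lookups go straight from a bkey to the cached
 * node.
 */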
163 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
164 {
165         int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
166
167         BUG_ON(ret);
168
169         /* Cause future lookups for this node to fail: */
170         b->hash_val = 0;
171 }
172
173 int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
174 {
175         BUG_ON(b->hash_val);
176         b->hash_val = btree_ptr_hash_val(&b->key);
177
178         return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
179                                              bch_btree_cache_params);
180 }
181
182 int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
183                                 unsigned level, enum btree_id id)
184 {
185         int ret;
186
187         b->c.level      = level;
188         b->c.btree_id   = id;
189
190         mutex_lock(&bc->lock);
191         ret = __bch2_btree_node_hash_insert(bc, b);
192         if (!ret)
193                 list_add_tail(&b->list, &bc->live);
194         mutex_unlock(&bc->lock);
195
196         return ret;
197 }
198
199 __flatten
200 static inline struct btree *btree_cache_find(struct btree_cache *bc,
201                                      const struct bkey_i *k)
202 {
203         u64 v = btree_ptr_hash_val(k);
204
205         return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
206 }
207
208 /*
209  * Attempt to reclaim a cached btree node: flush it if dirty (when allowed),
210  * then take its intent and write locks; returns 0 with the node locked.
211  */
212 static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
213 {
214         struct btree_cache *bc = &c->btree_cache;
215         int ret = 0;
216
217         lockdep_assert_held(&bc->lock);
218 wait_on_io:
219         if (b->flags & ((1U << BTREE_NODE_dirty)|
220                         (1U << BTREE_NODE_read_in_flight)|
221                         (1U << BTREE_NODE_write_in_flight))) {
222                 if (!flush) {
223                         if (btree_node_dirty(b))
224                                 BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
225                         else if (btree_node_read_in_flight(b))
226                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
227                         else if (btree_node_write_in_flight(b))
228                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
229                         return -BCH_ERR_ENOMEM_btree_node_reclaim;
230                 }
231
232                 /* XXX: waiting on IO with btree cache lock held */
233                 bch2_btree_node_wait_on_read(b);
234                 bch2_btree_node_wait_on_write(b);
235         }
236
237         if (!six_trylock_intent(&b->c.lock)) {
238                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
239                 return -BCH_ERR_ENOMEM_btree_node_reclaim;
240         }
241
242         if (!six_trylock_write(&b->c.lock)) {
243                 BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
244                 goto out_unlock_intent;
245         }
246
247         /* recheck under lock */
248         if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
249                         (1U << BTREE_NODE_write_in_flight))) {
250                 if (!flush) {
251                         if (btree_node_read_in_flight(b))
252                                 BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
253                         else if (btree_node_write_in_flight(b))
254                                 BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
255                         goto out_unlock;
256                 }
257                 six_unlock_write(&b->c.lock);
258                 six_unlock_intent(&b->c.lock);
259                 goto wait_on_io;
260         }
261
262         if (btree_node_noevict(b)) {
263                 BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
264                 goto out_unlock;
265         }
266         if (btree_node_write_blocked(b)) {
267                 BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
268                 goto out_unlock;
269         }
270         if (btree_node_will_make_reachable(b)) {
271                 BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
272                 goto out_unlock;
273         }
274
275         if (btree_node_dirty(b)) {
276                 if (!flush) {
277                         BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
278                         goto out_unlock;
279                 }
280                 /*
281                  * Using the underscore version because we don't want to compact
282                  * bsets after the write, since this node is about to be evicted
283                  * - unless btree verify mode is enabled, since it runs out of
284                  * the post write cleanup:
285                  */
286                 if (bch2_verify_btree_ondisk)
287                         bch2_btree_node_write(c, b, SIX_LOCK_intent,
288                                               BTREE_WRITE_cache_reclaim);
289                 else
290                         __bch2_btree_node_write(c, b,
291                                                 BTREE_WRITE_cache_reclaim);
292
293                 six_unlock_write(&b->c.lock);
294                 six_unlock_intent(&b->c.lock);
295                 goto wait_on_io;
296         }
297 out:
298         if (b->hash_val && !ret)
299                 trace_and_count(c, btree_cache_reap, c, b);
300         return ret;
301 out_unlock:
302         six_unlock_write(&b->c.lock);
303 out_unlock_intent:
304         six_unlock_intent(&b->c.lock);
305         ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
306         goto out;
307 }
308
309 static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
310 {
311         return __btree_node_reclaim(c, b, false, shrinker_counter);
312 }
313
314 static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
315 {
316         return __btree_node_reclaim(c, b, true, false);
317 }
318
319 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
320                                            struct shrink_control *sc)
321 {
322         struct bch_fs *c = container_of(shrink, struct bch_fs,
323                                         btree_cache.shrink);
324         struct btree_cache *bc = &c->btree_cache;
325         struct btree *b, *t;
326         unsigned long nr = sc->nr_to_scan;
327         unsigned long can_free = 0;
328         unsigned long freed = 0;
329         unsigned long touched = 0;
330         unsigned i, flags;
331         unsigned long ret = SHRINK_STOP;
332         bool trigger_writes = atomic_read(&bc->dirty) + nr >=
333                 bc->used * 3 / 4;
334
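	/*
	 * If dirty nodes plus the requested scan count make up most (>= 3/4) of
	 * the cache, also kick off writes of dirty nodes we can't free yet so a
	 * later scan can free them.
	 */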
335         if (bch2_btree_shrinker_disabled)
336                 return SHRINK_STOP;
337
338         mutex_lock(&bc->lock);
339         flags = memalloc_nofs_save();
340
341         /*
342          * It's _really_ critical that we don't free too many btree nodes - we
343          * have to always leave ourselves a reserve. The reserve is how we
344          * guarantee that allocating memory for a new btree node can always
345          * succeed, so that inserting keys into the btree can always succeed and
346          * IO can always make forward progress:
347          */
348         can_free = btree_cache_can_free(bc);
349         nr = min_t(unsigned long, nr, can_free);
350
351         i = 0;
352         list_for_each_entry_safe(b, t, &bc->freeable, list) {
353                 /*
354                  * Leave a few nodes on the freeable list, so that a btree split
355                  * won't have to hit the system allocator:
356                  */
357                 if (++i <= 3)
358                         continue;
359
360                 touched++;
361
362                 if (touched >= nr)
363                         goto out;
364
365                 if (!btree_node_reclaim(c, b, true)) {
366                         btree_node_data_free(c, b);
367                         six_unlock_write(&b->c.lock);
368                         six_unlock_intent(&b->c.lock);
369                         freed++;
370                         bc->freed++;
371                 }
372         }
373 restart:
374         list_for_each_entry_safe(b, t, &bc->live, list) {
375                 touched++;
376
377                 if (btree_node_accessed(b)) {
378                         clear_btree_node_accessed(b);
379                         bc->not_freed_access_bit++;
380                 } else if (!btree_node_reclaim(c, b, true)) {
381                         freed++;
382                         btree_node_data_free(c, b);
383                         bc->freed++;
384
385                         bch2_btree_node_hash_remove(bc, b);
386                         six_unlock_write(&b->c.lock);
387                         six_unlock_intent(&b->c.lock);
388
389                         if (freed == nr)
390                                 goto out_rotate;
391                 } else if (trigger_writes &&
392                            btree_node_dirty(b) &&
393                            !btree_node_will_make_reachable(b) &&
394                            !btree_node_write_blocked(b) &&
395                            six_trylock_read(&b->c.lock)) {
396                         list_move(&bc->live, &b->list);
397                         mutex_unlock(&bc->lock);
398                         __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
399                         six_unlock_read(&b->c.lock);
400                         if (touched >= nr)
401                                 goto out_nounlock;
402                         mutex_lock(&bc->lock);
403                         goto restart;
404                 }
405
406                 if (touched >= nr)
407                         break;
408         }
409 out_rotate:
410         if (&t->list != &bc->live)
411                 list_move_tail(&bc->live, &t->list);
412 out:
413         mutex_unlock(&bc->lock);
414 out_nounlock:
415         ret = freed;
416         memalloc_nofs_restore(flags);
417         trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
418         return ret;
419 }
420
421 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
422                                             struct shrink_control *sc)
423 {
424         struct bch_fs *c = container_of(shrink, struct bch_fs,
425                                         btree_cache.shrink);
426         struct btree_cache *bc = &c->btree_cache;
427
428         if (bch2_btree_shrinker_disabled)
429                 return 0;
430
431         return btree_cache_can_free(bc);
432 }
433
434 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
435 {
436         struct bch_fs *c = container_of(shrink, struct bch_fs,
437                                         btree_cache.shrink);
438         char *cbuf;
439         size_t buflen = seq_buf_get_buf(s, &cbuf);
440         struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
441
442         bch2_btree_cache_to_text(&out, &c->btree_cache);
443         seq_buf_commit(s, out.pos);
444 }
445
446 void bch2_fs_btree_cache_exit(struct bch_fs *c)
447 {
448         struct btree_cache *bc = &c->btree_cache;
449         struct btree *b;
450         unsigned i, flags;
451
452         if (bc->shrink.list.next)
453                 unregister_shrinker(&bc->shrink);
454
455         /* vfree() can allocate memory: */
456         flags = memalloc_nofs_save();
457         mutex_lock(&bc->lock);
458
459         if (c->verify_data)
460                 list_move(&c->verify_data->list, &bc->live);
461
462         kvpfree(c->verify_ondisk, btree_bytes(c));
463
464         for (i = 0; i < BTREE_ID_NR; i++)
465                 if (c->btree_roots[i].b)
466                         list_add(&c->btree_roots[i].b->list, &bc->live);
467
468         list_splice(&bc->freeable, &bc->live);
469
470         while (!list_empty(&bc->live)) {
471                 b = list_first_entry(&bc->live, struct btree, list);
472
473                 BUG_ON(btree_node_read_in_flight(b) ||
474                        btree_node_write_in_flight(b));
475
476                 if (btree_node_dirty(b))
477                         bch2_btree_complete_write(c, b, btree_current_write(b));
478                 clear_btree_node_dirty_acct(c, b);
479
480                 btree_node_data_free(c, b);
481         }
482
483         BUG_ON(atomic_read(&c->btree_cache.dirty));
484
485         list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
486
487         while (!list_empty(&bc->freed_nonpcpu)) {
488                 b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
489                 list_del(&b->list);
490                 six_lock_exit(&b->c.lock);
491                 kfree(b);
492         }
493
494         mutex_unlock(&bc->lock);
495         memalloc_nofs_restore(flags);
496
497         if (bc->table_init_done)
498                 rhashtable_destroy(&bc->table);
499 }
500
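/*
 * Startup: initialize the hash table, preallocate the reserve of btree nodes
 * (they start out on the freeable list), and register the shrinker.
 */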
501 int bch2_fs_btree_cache_init(struct bch_fs *c)
502 {
503         struct btree_cache *bc = &c->btree_cache;
504         unsigned i;
505         int ret = 0;
506
507         pr_verbose_init(c->opts, "");
508
509         ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
510         if (ret)
511                 goto out;
512
513         bc->table_init_done = true;
514
515         bch2_recalc_btree_reserve(c);
516
517         for (i = 0; i < bc->reserve; i++)
518                 if (!__bch2_btree_node_mem_alloc(c)) {
519                         ret = -BCH_ERR_ENOMEM_fs_btree_cache_init;
520                         goto out;
521                 }
522
523         list_splice_init(&bc->live, &bc->freeable);
524
525         mutex_init(&c->verify_lock);
526
527         bc->shrink.count_objects        = bch2_btree_cache_count;
528         bc->shrink.scan_objects         = bch2_btree_cache_scan;
529         bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
530         bc->shrink.seeks                = 4;
531         ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
532 out:
533         pr_verbose_init(c->opts, "ret %i", ret);
534         return ret;
535 }
536
537 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
538 {
539         mutex_init(&bc->lock);
540         INIT_LIST_HEAD(&bc->live);
541         INIT_LIST_HEAD(&bc->freeable);
542         INIT_LIST_HEAD(&bc->freed_pcpu);
543         INIT_LIST_HEAD(&bc->freed_nonpcpu);
544 }
545
546 /*
547  * We can only have one thread cannibalizing other cached btree nodes at a
548  * time, or we'll deadlock.  We use an open coded mutex for this, taken via
549  * bch2_btree_cache_cannibalize_lock().  This means every time we unlock the
550  * root of the btree, we need to release this lock if we have it held.
551  */
552 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
553 {
554         struct btree_cache *bc = &c->btree_cache;
555
556         if (bc->alloc_lock == current) {
557                 trace_and_count(c, btree_cache_cannibalize_unlock, c);
558                 bc->alloc_lock = NULL;
559                 closure_wake_up(&bc->alloc_wait);
560         }
561 }
562
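/*
 * Take the cannibalize lock for the current task. With cl == NULL this fails
 * immediately if another task holds it; with a closure we wait on
 * bc->alloc_wait and return -BCH_ERR_btree_cache_cannibalize_lock_blocked.
 *
 * Illustrative pattern (a sketch, not copied from a real caller):
 *
 *	ret = bch2_btree_cache_cannibalize_lock(c, cl);
 *	if (!ret) {
 *		... allocate btree node(s), possibly cannibalizing ...
 *		bch2_btree_cache_cannibalize_unlock(c);
 *	}
 */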
563 int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
564 {
565         struct btree_cache *bc = &c->btree_cache;
566         struct task_struct *old;
567
568         old = cmpxchg(&bc->alloc_lock, NULL, current);
569         if (old == NULL || old == current)
570                 goto success;
571
572         if (!cl) {
573                 trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
574                 return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
575         }
576
577         closure_wait(&bc->alloc_wait, cl);
578
579         /* Try again, after adding ourselves to waitlist */
580         old = cmpxchg(&bc->alloc_lock, NULL, current);
581         if (old == NULL || old == current) {
582                 /* We raced */
583                 closure_wake_up(&bc->alloc_wait);
584                 goto success;
585         }
586
587         trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
588         return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
589
590 success:
591         trace_and_count(c, btree_cache_cannibalize_lock, c);
592         return 0;
593 }
594
595 static struct btree *btree_node_cannibalize(struct bch_fs *c)
596 {
597         struct btree_cache *bc = &c->btree_cache;
598         struct btree *b;
599
600         list_for_each_entry_reverse(b, &bc->live, list)
601                 if (!btree_node_reclaim(c, b, false))
602                         return b;
603
604         while (1) {
605                 list_for_each_entry_reverse(b, &bc->live, list)
606                         if (!btree_node_write_and_reclaim(c, b))
607                                 return b;
608
609                 /*
610                  * Rare case: all nodes were intent-locked.
611                  * Just busy-wait.
612                  */
613                 WARN_ONCE(1, "btree cache cannibalize failed\n");
614                 cond_resched();
615         }
616 }
617
618 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
619 {
620         struct bch_fs *c = trans->c;
621         struct btree_cache *bc = &c->btree_cache;
622         struct list_head *freed = pcpu_read_locks
623                 ? &bc->freed_pcpu
624                 : &bc->freed_nonpcpu;
625         struct btree *b, *b2;
626         u64 start_time = local_clock();
627         unsigned flags;
628
629         flags = memalloc_nofs_save();
630         mutex_lock(&bc->lock);
631
632         /*
633          * We never free struct btree itself, just the memory that holds the on
634          * disk node. Check the freed list before allocating a new one:
635          */
636         list_for_each_entry(b, freed, list)
637                 if (!btree_node_reclaim(c, b, false)) {
638                         list_del_init(&b->list);
639                         goto got_node;
640                 }
641
642         b = __btree_node_mem_alloc(c, __GFP_NOWARN);
643         if (!b) {
644                 mutex_unlock(&bc->lock);
645                 b = __btree_node_mem_alloc(c, GFP_KERNEL);
646                 if (!b)
647                         goto err;
648                 mutex_lock(&bc->lock);
649         }
650
651         bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
652
653         BUG_ON(!six_trylock_intent(&b->c.lock));
654         BUG_ON(!six_trylock_write(&b->c.lock));
655 got_node:
656
657         /*
658          * btree_free() doesn't free memory; it sticks the node on the end of
659          * the list. Check if there's any freed nodes there:
660          */
661         list_for_each_entry(b2, &bc->freeable, list)
662                 if (!btree_node_reclaim(c, b2, false)) {
663                         swap(b->data, b2->data);
664                         swap(b->aux_data, b2->aux_data);
665                         btree_node_to_freedlist(bc, b2);
666                         six_unlock_write(&b2->c.lock);
667                         six_unlock_intent(&b2->c.lock);
668                         goto got_mem;
669                 }
670
671         mutex_unlock(&bc->lock);
672
673         if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
674                 goto err;
675
676         mutex_lock(&bc->lock);
677         bc->used++;
678 got_mem:
679         mutex_unlock(&bc->lock);
680
681         BUG_ON(btree_node_hashed(b));
682         BUG_ON(btree_node_dirty(b));
683         BUG_ON(btree_node_write_in_flight(b));
684 out:
685         b->flags                = 0;
686         b->written              = 0;
687         b->nsets                = 0;
688         b->sib_u64s[0]          = 0;
689         b->sib_u64s[1]          = 0;
690         b->whiteout_u64s        = 0;
691         bch2_btree_keys_init(b);
692         set_btree_node_accessed(b);
693
694         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
695                                start_time);
696
697         memalloc_nofs_restore(flags);
698         return b;
699 err:
700         mutex_lock(&bc->lock);
701
702         /* Try to cannibalize another cached btree node: */
703         if (bc->alloc_lock == current) {
704                 b2 = btree_node_cannibalize(c);
705                 clear_btree_node_just_written(b2);
706                 bch2_btree_node_hash_remove(bc, b2);
707
708                 if (b) {
709                         swap(b->data, b2->data);
710                         swap(b->aux_data, b2->aux_data);
711                         btree_node_to_freedlist(bc, b2);
712                         six_unlock_write(&b2->c.lock);
713                         six_unlock_intent(&b2->c.lock);
714                 } else {
715                         b = b2;
716                         list_del_init(&b->list);
717                 }
718
719                 mutex_unlock(&bc->lock);
720
721                 trace_and_count(c, btree_cache_cannibalize, c);
722                 goto out;
723         }
724
725         mutex_unlock(&bc->lock);
726         memalloc_nofs_restore(flags);
727         return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
728 }
729
730 /* Slowpath, don't want it inlined into btree_iter_traverse() */
731 static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
732                                 struct btree_path *path,
733                                 const struct bkey_i *k,
734                                 enum btree_id btree_id,
735                                 unsigned level,
736                                 enum six_lock_type lock_type,
737                                 bool sync)
738 {
739         struct bch_fs *c = trans->c;
740         struct btree_cache *bc = &c->btree_cache;
741         struct btree *b;
742         u32 seq;
743
744         BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
745         /*
746          * Parent node must be locked, else we could read in a btree node that's
747          * been freed:
748          */
749         if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
750                 trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
751                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
752         }
753
754         b = bch2_btree_node_mem_alloc(trans, level != 0);
755
756         if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
757                 trans->memory_allocation_failure = true;
758                 trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
759                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
760         }
761
762         if (IS_ERR(b))
763                 return b;
764
765         /*
766          * Btree nodes read in from disk should not have the accessed bit set
767          * initially, so that linear scans don't thrash the cache:
768          */
769         clear_btree_node_accessed(b);
770
771         bkey_copy(&b->key, k);
772         if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
773                 /* raced with another fill: */
774
775                 /* mark as unhashed... */
776                 b->hash_val = 0;
777
778                 mutex_lock(&bc->lock);
779                 list_add(&b->list, &bc->freeable);
780                 mutex_unlock(&bc->lock);
781
782                 six_unlock_write(&b->c.lock);
783                 six_unlock_intent(&b->c.lock);
784                 return NULL;
785         }
786
787         set_btree_node_read_in_flight(b);
788
789         six_unlock_write(&b->c.lock);
790         seq = six_lock_seq(&b->c.lock);
791         six_unlock_intent(&b->c.lock);
792
793         /* Unlock before doing IO: */
794         if (trans && sync)
795                 bch2_trans_unlock(trans);
796
797         bch2_btree_node_read(c, b, sync);
798
799         if (!sync)
800                 return NULL;
801
802         if (path) {
803                 int ret = bch2_trans_relock(trans) ?:
804                         bch2_btree_path_relock_intent(trans, path);
805                 if (ret) {
806                         BUG_ON(!trans->restarted);
807                         return ERR_PTR(ret);
808                 }
809         }
810
811         if (!six_relock_type(&b->c.lock, lock_type, seq)) {
812                 if (path)
813                         trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
814                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
815         }
816
817         return b;
818 }
819
820 static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
821 {
822         struct printbuf buf = PRINTBUF;
823
824         if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
825                 return;
826
827         prt_printf(&buf,
828                "btree node header doesn't match ptr\n"
829                "btree %s level %u\n"
830                "ptr: ",
831                bch2_btree_ids[b->c.btree_id], b->c.level);
832         bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
833
834         prt_printf(&buf, "\nheader: btree %s level %llu\n"
835                "min ",
836                bch2_btree_ids[BTREE_NODE_ID(b->data)],
837                BTREE_NODE_LEVEL(b->data));
838         bch2_bpos_to_text(&buf, b->data->min_key);
839
840         prt_printf(&buf, "\nmax ");
841         bch2_bpos_to_text(&buf, b->data->max_key);
842
843         bch2_fs_inconsistent(c, "%s", buf.buf);
844         printbuf_exit(&buf);
845 }
846
847 static inline void btree_check_header(struct bch_fs *c, struct btree *b)
848 {
849         if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
850             b->c.level != BTREE_NODE_LEVEL(b->data) ||
851             !bpos_eq(b->data->max_key, b->key.k.p) ||
852             (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
853              !bpos_eq(b->data->min_key,
854                       bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
855                 btree_bad_header(c, b);
856 }
857
858 static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
859                                            const struct bkey_i *k, unsigned level,
860                                            enum six_lock_type lock_type,
861                                            unsigned long trace_ip)
862 {
863         struct bch_fs *c = trans->c;
864         struct btree_cache *bc = &c->btree_cache;
865         struct btree *b;
866         struct bset_tree *t;
867         int ret;
868
869         EBUG_ON(level >= BTREE_MAX_DEPTH);
870 retry:
871         b = btree_cache_find(bc, k);
872         if (unlikely(!b)) {
873                 /*
874                  * We must have the parent locked to call bch2_btree_node_fill(),
875                  * else we could read in a btree node from disk that's been
876                  * freed:
877                  */
878                 b = bch2_btree_node_fill(trans, path, k, path->btree_id,
879                                          level, lock_type, true);
880
881                 /* We raced and found the btree node in the cache */
882                 if (!b)
883                         goto retry;
884
885                 if (IS_ERR(b))
886                         return b;
887         } else {
888                 if (btree_node_read_locked(path, level + 1))
889                         btree_node_unlock(trans, path, level + 1);
890
891                 ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
892                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
893                         return ERR_PTR(ret);
894
895                 BUG_ON(ret);
896
897                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
898                              b->c.level != level ||
899                              race_fault())) {
900                         six_unlock_type(&b->c.lock, lock_type);
901                         if (bch2_btree_node_relock(trans, path, level + 1))
902                                 goto retry;
903
904                         trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
905                         return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
906                 }
907
908                 /* avoid atomic set bit if it's not needed: */
909                 if (!btree_node_accessed(b))
910                         set_btree_node_accessed(b);
911         }
912
913         if (unlikely(btree_node_read_in_flight(b))) {
914                 u32 seq = six_lock_seq(&b->c.lock);
915
916                 six_unlock_type(&b->c.lock, lock_type);
917                 bch2_trans_unlock(trans);
918
919                 bch2_btree_node_wait_on_read(b);
920
921                 /*
922                  * should_be_locked is not set on this path yet, so we need to
923                  * relock it specifically:
924                  */
925                 if (trans) {
926                         int ret = bch2_trans_relock(trans) ?:
927                                 bch2_btree_path_relock_intent(trans, path);
928                         if (ret) {
929                                 BUG_ON(!trans->restarted);
930                                 return ERR_PTR(ret);
931                         }
932                 }
933
934                 if (!six_relock_type(&b->c.lock, lock_type, seq))
935                         goto retry;
936         }
937
938         prefetch(b->aux_data);
939
940         for_each_bset(b, t) {
941                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
942
943                 prefetch(p + L1_CACHE_BYTES * 0);
944                 prefetch(p + L1_CACHE_BYTES * 1);
945                 prefetch(p + L1_CACHE_BYTES * 2);
946         }
947
948         if (unlikely(btree_node_read_error(b))) {
949                 six_unlock_type(&b->c.lock, lock_type);
950                 return ERR_PTR(-EIO);
951         }
952
953         EBUG_ON(b->c.btree_id != path->btree_id);
954         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
955         btree_check_header(c, b);
956
957         return b;
958 }
959
960 /**
961  * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
962  * in from disk if necessary.
963  *
964  * The btree node is returned with a lock of the requested @lock_type held;
965  * on failure an ERR_PTR() is returned (e.g. a transaction restart or -EIO).
966  */
967 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
968                                   const struct bkey_i *k, unsigned level,
969                                   enum six_lock_type lock_type,
970                                   unsigned long trace_ip)
971 {
972         struct bch_fs *c = trans->c;
973         struct btree *b;
974         struct bset_tree *t;
975         int ret;
976
977         EBUG_ON(level >= BTREE_MAX_DEPTH);
978
979         b = btree_node_mem_ptr(k);
980
981         /*
982          * Check b->hash_val _before_ calling btree_node_lock() - this might not
983          * be the node we want anymore, and trying to lock the wrong node could
984  * cause an unnecessary transaction restart:
985          */
986         if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
987                      !b ||
988                      b->hash_val != btree_ptr_hash_val(k)))
989                 return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
990
991         if (btree_node_read_locked(path, level + 1))
992                 btree_node_unlock(trans, path, level + 1);
993
994         ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
995         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
996                 return ERR_PTR(ret);
997
998         BUG_ON(ret);
999
1000         if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1001                      b->c.level != level ||
1002                      race_fault())) {
1003                 six_unlock_type(&b->c.lock, lock_type);
1004                 if (bch2_btree_node_relock(trans, path, level + 1))
1005                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1006
1007                 trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
1008                 return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
1009         }
1010
1011         if (unlikely(btree_node_read_in_flight(b))) {
1012                 u32 seq = six_lock_seq(&b->c.lock);
1013
1014                 six_unlock_type(&b->c.lock, lock_type);
1015                 bch2_trans_unlock(trans);
1016
1017                 bch2_btree_node_wait_on_read(b);
1018
1019                 /*
1020                  * should_be_locked is not set on this path yet, so we need to
1021                  * relock it specifically:
1022                  */
1023                 if (trans) {
1024                         int ret = bch2_trans_relock(trans) ?:
1025                                 bch2_btree_path_relock_intent(trans, path);
1026                         if (ret) {
1027                                 BUG_ON(!trans->restarted);
1028                                 return ERR_PTR(ret);
1029                         }
1030                 }
1031
1032                 if (!six_relock_type(&b->c.lock, lock_type, seq))
1033                         return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
1034         }
1035
1036         prefetch(b->aux_data);
1037
1038         for_each_bset(b, t) {
1039                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1040
1041                 prefetch(p + L1_CACHE_BYTES * 0);
1042                 prefetch(p + L1_CACHE_BYTES * 1);
1043                 prefetch(p + L1_CACHE_BYTES * 2);
1044         }
1045
1046         /* avoid atomic set bit if it's not needed: */
1047         if (!btree_node_accessed(b))
1048                 set_btree_node_accessed(b);
1049
1050         if (unlikely(btree_node_read_error(b))) {
1051                 six_unlock_type(&b->c.lock, lock_type);
1052                 return ERR_PTR(-EIO);
1053         }
1054
1055         EBUG_ON(b->c.btree_id != path->btree_id);
1056         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1057         btree_check_header(c, b);
1058
1059         return b;
1060 }
1061
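/*
 * Variant of bch2_btree_node_get() for callers without a btree_path: the node
 * is looked up by key alone and returned read locked, an ERR_PTR() on error,
 * or NULL if nofill was set and the node wasn't cached.
 */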
1062 struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
1063                                          const struct bkey_i *k,
1064                                          enum btree_id btree_id,
1065                                          unsigned level,
1066                                          bool nofill)
1067 {
1068         struct bch_fs *c = trans->c;
1069         struct btree_cache *bc = &c->btree_cache;
1070         struct btree *b;
1071         struct bset_tree *t;
1072         int ret;
1073
1074         EBUG_ON(level >= BTREE_MAX_DEPTH);
1075
1076         if (c->opts.btree_node_mem_ptr_optimization) {
1077                 b = btree_node_mem_ptr(k);
1078                 if (b)
1079                         goto lock_node;
1080         }
1081 retry:
1082         b = btree_cache_find(bc, k);
1083         if (unlikely(!b)) {
1084                 if (nofill)
1085                         goto out;
1086
1087                 b = bch2_btree_node_fill(trans, NULL, k, btree_id,
1088                                          level, SIX_LOCK_read, true);
1089
1090                 /* We raced and found the btree node in the cache */
1091                 if (!b)
1092                         goto retry;
1093
1094                 if (IS_ERR(b) &&
1095                     !bch2_btree_cache_cannibalize_lock(c, NULL))
1096                         goto retry;
1097
1098                 if (IS_ERR(b))
1099                         goto out;
1100         } else {
1101 lock_node:
1102                 ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
1103                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1104                         return ERR_PTR(ret);
1105
1106                 BUG_ON(ret);
1107
1108                 if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
1109                              b->c.btree_id != btree_id ||
1110                              b->c.level != level)) {
1111                         six_unlock_read(&b->c.lock);
1112                         goto retry;
1113                 }
1114         }
1115
1116         /* XXX: waiting on IO with btree locks held: */
1117         __bch2_btree_node_wait_on_read(b);
1118
1119         prefetch(b->aux_data);
1120
1121         for_each_bset(b, t) {
1122                 void *p = (u64 *) b->aux_data + t->aux_data_offset;
1123
1124                 prefetch(p + L1_CACHE_BYTES * 0);
1125                 prefetch(p + L1_CACHE_BYTES * 1);
1126                 prefetch(p + L1_CACHE_BYTES * 2);
1127         }
1128
1129         /* avoid atomic set bit if it's not needed: */
1130         if (!btree_node_accessed(b))
1131                 set_btree_node_accessed(b);
1132
1133         if (unlikely(btree_node_read_error(b))) {
1134                 six_unlock_read(&b->c.lock);
1135                 b = ERR_PTR(-EIO);
1136                 goto out;
1137         }
1138
1139         EBUG_ON(b->c.btree_id != btree_id);
1140         EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
1141         btree_check_header(c, b);
1142 out:
1143         bch2_btree_cache_cannibalize_unlock(c);
1144         return b;
1145 }
1146
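/*
 * Kick off an asynchronous read of a btree node into the cache, if it isn't
 * cached already. Returns 0 or an error from the fill.
 */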
1147 int bch2_btree_node_prefetch(struct btree_trans *trans,
1148                              struct btree_path *path,
1149                              const struct bkey_i *k,
1150                              enum btree_id btree_id, unsigned level)
1151 {
1152         struct bch_fs *c = trans->c;
1153         struct btree_cache *bc = &c->btree_cache;
1154         struct btree *b;
1155
1156         BUG_ON(trans && !btree_node_locked(path, level + 1));
1157         BUG_ON(level >= BTREE_MAX_DEPTH);
1158
1159         b = btree_cache_find(bc, k);
1160         if (b)
1161                 return 0;
1162
1163         b = bch2_btree_node_fill(trans, path, k, btree_id,
1164                                  level, SIX_LOCK_read, false);
1165         return PTR_ERR_OR_ZERO(b);
1166 }
1167
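/*
 * Drop a node from the cache entirely: wait for in-flight IO, write it out if
 * dirty, then free its data and remove it from the hash table.
 */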
1168 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
1169 {
1170         struct bch_fs *c = trans->c;
1171         struct btree_cache *bc = &c->btree_cache;
1172         struct btree *b;
1173
1174         b = btree_cache_find(bc, k);
1175         if (!b)
1176                 return;
1177 wait_on_io:
1178         /* not allowed to wait on io with btree locks held: */
1179
1180         /* XXX we're called from btree_gc which will be holding other btree
1181          * nodes locked
1182          */
1183         __bch2_btree_node_wait_on_read(b);
1184         __bch2_btree_node_wait_on_write(b);
1185
1186         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
1187         btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
1188
1189         if (btree_node_dirty(b)) {
1190                 __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
1191                 six_unlock_write(&b->c.lock);
1192                 six_unlock_intent(&b->c.lock);
1193                 goto wait_on_io;
1194         }
1195
1196         BUG_ON(btree_node_dirty(b));
1197
1198         mutex_lock(&bc->lock);
1199         btree_node_data_free(c, b);
1200         bch2_btree_node_hash_remove(bc, b);
1201         mutex_unlock(&bc->lock);
1202
1203         six_unlock_write(&b->c.lock);
1204         six_unlock_intent(&b->c.lock);
1205 }
1206
1207 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
1208                              const struct btree *b)
1209 {
1210         const struct bkey_format *f = &b->format;
1211         struct bset_stats stats;
1212
1213         memset(&stats, 0, sizeof(stats));
1214
1215         bch2_btree_keys_stats(b, &stats);
1216
1217         prt_printf(out, "l %u ", b->c.level);
1218         bch2_bpos_to_text(out, b->data->min_key);
1219         prt_printf(out, " - ");
1220         bch2_bpos_to_text(out, b->data->max_key);
1221         prt_printf(out, ":\n"
1222                "    ptrs: ");
1223         bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
1224
1225         prt_printf(out, "\n"
1226                "    format: u64s %u fields %u %u %u %u %u\n"
1227                "    unpack fn len: %u\n"
1228                "    bytes used %zu/%zu (%zu%% full)\n"
1229                "    sib u64s: %u, %u (merge threshold %u)\n"
1230                "    nr packed keys %u\n"
1231                "    nr unpacked keys %u\n"
1232                "    floats %zu\n"
1233                "    failed unpacked %zu\n",
1234                f->key_u64s,
1235                f->bits_per_field[0],
1236                f->bits_per_field[1],
1237                f->bits_per_field[2],
1238                f->bits_per_field[3],
1239                f->bits_per_field[4],
1240                b->unpack_fn_len,
1241                b->nr.live_u64s * sizeof(u64),
1242                btree_bytes(c) - sizeof(struct btree_node),
1243                b->nr.live_u64s * 100 / btree_max_u64s(c),
1244                b->sib_u64s[0],
1245                b->sib_u64s[1],
1246                c->btree_foreground_merge_threshold,
1247                b->nr.packed_keys,
1248                b->nr.unpacked_keys,
1249                stats.floats,
1250                stats.failed);
1251 }
1252
1253 void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
1254 {
1255         prt_printf(out, "nr nodes:\t\t%u\n", bc->used);
1256         prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
1257         prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
1258
1259         prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed);
1260         prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty);
1261         prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight);
1262         prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight);
1263         prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent);
1264         prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write);
1265         prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit);
1266         prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict);
1267         prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked);
1268         prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable);
1269
1270 }