// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"

#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

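/*
 * Cached keys for the subvolumes btree use percpu reader locks; they're read
 * by most transactions (for snapshot handling), so this presumably keeps
 * shared lock acquisitions from bouncing a single lock cacheline around.
 */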
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
        return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                       const void *obj)
{
        const struct bkey_cached *ck = obj;
        const struct bkey_cached_key *key = arg->key;

        return cmp_int(ck->key.btree_id, key->btree_id) ?:
                bpos_cmp(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
        .head_offset    = offsetof(struct bkey_cached, hash),
        .key_offset     = offsetof(struct bkey_cached, key),
        .key_len        = sizeof(struct bkey_cached_key),
        .obj_cmpfn      = bch2_btree_key_cache_cmp_fn,
};

__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
        struct bkey_cached_key key = {
                .btree_id       = btree_id,
                .pos            = pos,
        };

        return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
                                      bch2_btree_key_cache_params);
}

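/*
 * Take the intent and write locks without blocking, if we can; a dirty key
 * still has an update that needs to be flushed and so can't be evicted.
 */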
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
        if (!six_trylock_intent(&ck->c.lock))
                return false;

        if (!six_trylock_write(&ck->c.lock)) {
                six_unlock_intent(&ck->c.lock);
                return false;
        }

        if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_write(&ck->c.lock);
                six_unlock_intent(&ck->c.lock);
                return false;
        }

        return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
                              struct bkey_cached *ck)
{
        BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
                                      bch2_btree_key_cache_params));
        memset(&ck->key, ~0, sizeof(ck->key));

        atomic_long_dec(&c->nr_keys);
}

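/*
 * Move a clean entry to a freed list. The memory isn't returned to the slab
 * until every btree_trans that might still be using it has passed the SRCU
 * barrier whose sequence number is recorded here (see the shrinker).
 */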
static void bkey_cached_free(struct btree_key_cache *bc,
                             struct bkey_cached *ck)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

        BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);

        if (ck->c.lock.readers)
                list_move_tail(&ck->list, &bc->freed_pcpu);
        else
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
        atomic_long_inc(&bc->nr_freed);

        kfree(ck->k);
        ck->k           = NULL;
        ck->u64s        = 0;

        six_unlock_write(&ck->c.lock);
        six_unlock_intent(&ck->c.lock);
}

#ifdef __KERNEL__
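/*
 * Insert @ck into the nonpcpu freed list so that the list stays ordered by
 * SRCU barrier sequence, oldest first: the shrinker walks it from the front
 * and stops at the first entry whose grace period hasn't elapsed yet.
 */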
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
                                                   struct bkey_cached *ck)
{
        struct bkey_cached *pos;

        list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
                if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
                                 pos->btree_trans_barrier_seq)) {
                        list_move(&ck->list, &pos->list);
                        return;
                }
        }

        list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

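/*
 * Return an entry to the freelists: entries that don't use percpu reader
 * locks go to the percpu freelist first, spilling half of it (in order) to
 * the global freed_nonpcpu list when it fills up; entries with percpu reader
 * locks always go to the global freed_pcpu list.
 */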
static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
                                         struct bkey_cached *ck)
{
        BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

        if (!ck->c.lock.readers) {
#ifdef __KERNEL__
                struct btree_key_cache_freelist *f;
                bool freed = false;

                preempt_disable();
                f = this_cpu_ptr(bc->pcpu_freed);

                if (f->nr < ARRAY_SIZE(f->objs)) {
                        f->objs[f->nr++] = ck;
                        freed = true;
                }
                preempt_enable();

                if (!freed) {
                        mutex_lock(&bc->lock);
                        preempt_disable();
                        f = this_cpu_ptr(bc->pcpu_freed);

                        while (f->nr > ARRAY_SIZE(f->objs) / 2) {
                                struct bkey_cached *ck2 = f->objs[--f->nr];

                                __bkey_cached_move_to_freelist_ordered(bc, ck2);
                        }
                        preempt_enable();

                        __bkey_cached_move_to_freelist_ordered(bc, ck);
                        mutex_unlock(&bc->lock);
                }
#else
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
                mutex_unlock(&bc->lock);
#endif
        } else {
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_pcpu);
                mutex_unlock(&bc->lock);
        }
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
                                  struct bkey_cached *ck)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);

        list_del_init(&ck->list);
        atomic_long_inc(&bc->nr_freed);

        kfree(ck->k);
        ck->k           = NULL;
        ck->u64s        = 0;

        bkey_cached_move_to_freelist(bc, ck);

        six_unlock_write(&ck->c.lock);
        six_unlock_intent(&ck->c.lock);
}

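/*
 * Allocate a bkey_cached, trying the freelists before the slab: the percpu
 * freelist (refilled from freed_nonpcpu under bc->lock when empty), then the
 * global freed lists. An entry taken from a freelist has to be relocked,
 * which can fail with a transaction restart; a freshly allocated entry is
 * returned with both locks held and *was_new set.
 */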
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                  bool *was_new)
{
        struct bch_fs *c = trans->c;
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck = NULL;
        bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);

        if (!pcpu_readers) {
#ifdef __KERNEL__
                struct btree_key_cache_freelist *f;

                preempt_disable();
                f = this_cpu_ptr(bc->pcpu_freed);
                if (f->nr)
                        ck = f->objs[--f->nr];
                preempt_enable();

                if (!ck) {
                        mutex_lock(&bc->lock);
                        preempt_disable();
                        f = this_cpu_ptr(bc->pcpu_freed);

                        while (!list_empty(&bc->freed_nonpcpu) &&
                               f->nr < ARRAY_SIZE(f->objs) / 2) {
                                ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                                list_del_init(&ck->list);
                                f->objs[f->nr++] = ck;
                        }

                        ck = f->nr ? f->objs[--f->nr] : NULL;
                        preempt_enable();
                        mutex_unlock(&bc->lock);
                }
#else
                mutex_lock(&bc->lock);
                if (!list_empty(&bc->freed_nonpcpu)) {
                        ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                        list_del_init(&ck->list);
                }
                mutex_unlock(&bc->lock);
#endif
        } else {
                mutex_lock(&bc->lock);
                if (!list_empty(&bc->freed_pcpu)) {
                        ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
                        list_del_init(&ck->list);
                }
                mutex_unlock(&bc->lock);
        }

        if (ck) {
                int ret;

                ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent);
                if (unlikely(ret)) {
                        bkey_cached_move_to_freelist(bc, ck);
                        return ERR_PTR(ret);
                }

                path->l[0].b = (void *) ck;
                path->l[0].lock_seq = ck->c.lock.state.seq;
                mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);

                ret = bch2_btree_node_lock_write(trans, path, &ck->c);
                if (unlikely(ret)) {
                        btree_node_unlock(trans, path, 0);
                        bkey_cached_move_to_freelist(bc, ck);
                        return ERR_PTR(ret);
                }

                return ck;
        }

        /* GFP_NOFS because we're holding btree locks: */
        ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
        if (likely(ck)) {
                INIT_LIST_HEAD(&ck->list);
                __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key);
                if (pcpu_readers)
                        six_lock_pcpu_alloc(&ck->c.lock);

                ck->c.cached = true;
                BUG_ON(!six_trylock_intent(&ck->c.lock));
                BUG_ON(!six_trylock_write(&ck->c.lock));
                *was_new = true;
                return ck;
        }

        return NULL;
}

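/*
 * Fallback for when allocating a new entry fails: steal a clean, unlocked
 * entry straight out of the hash table and reuse it.
 */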
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct bkey_cached *ck;
        unsigned i;

        mutex_lock(&c->lock);
        rcu_read_lock();
        tbl = rht_dereference_rcu(c->table.tbl, &c->table);
        for (i = 0; i < tbl->size; i++)
                rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
                        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
                            bkey_cached_lock_for_evict(ck)) {
                                bkey_cached_evict(c, ck);
                                goto out;
                        }
                }
        ck = NULL;
out:
        rcu_read_unlock();
        mutex_unlock(&c->lock);
        return ck;
}

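/*
 * Allocate a new cached key for @path and insert it into the hash table; if
 * we race with another fill the entry is freed again and NULL is returned so
 * the caller can retry.
 */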
static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
        struct bch_fs *c = trans->c;
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck;
        bool was_new = false;

        ck = bkey_cached_alloc(trans, path, &was_new);
        if (IS_ERR(ck))
                return ck;

        if (unlikely(!ck)) {
                ck = bkey_cached_reuse(bc);
                if (unlikely(!ck)) {
                        bch_err(c, "error allocating memory for key cache item, btree %s",
                                bch2_btree_ids[path->btree_id]);
                        return ERR_PTR(-ENOMEM);
                }

                mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
        } else {
                if (path->btree_id == BTREE_ID_subvolumes)
                        six_lock_pcpu_alloc(&ck->c.lock);
        }

        ck->c.level             = 0;
        ck->c.btree_id          = path->btree_id;
        ck->key.btree_id        = path->btree_id;
        ck->key.pos             = path->pos;
        ck->valid               = false;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;

        if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
                                          &ck->hash,
                                          bch2_btree_key_cache_params))) {
                /* We raced with another fill: */

                if (likely(was_new)) {
                        six_unlock_write(&ck->c.lock);
                        six_unlock_intent(&ck->c.lock);
                        kfree(ck);
                } else {
                        bkey_cached_free_fast(bc, ck);
                }

                mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
                return NULL;
        }

        atomic_long_inc(&bc->nr_keys);

        six_unlock_write(&ck->c.lock);

        return ck;
}

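/*
 * Read the key at ck->key from the btree proper and copy it into the cached
 * entry, reallocating ck->k with some slack so that later updates are less
 * likely to need a reallocation (and thus a transaction restart).
 */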
static int btree_key_cache_fill(struct btree_trans *trans,
                                struct btree_path *ck_path,
                                struct bkey_cached *ck)
{
        struct btree_path *path;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
        struct bkey u;
        int ret;

        path = bch2_path_get(trans, ck->key.btree_id,
                             ck->key.pos, 0, 0, 0, _THIS_IP_);
        ret = bch2_btree_path_traverse(trans, path, 0);
        if (ret)
                goto err;

        k = bch2_btree_path_peek_slot(path, &u);

        if (!bch2_btree_node_relock(trans, ck_path, 0)) {
                trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
                ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
                goto err;
        }

        /*
         * bch2_varint_decode can read past the end of the buffer by at
         * most 7 bytes (it won't be used):
         */
        new_u64s = k.k->u64s + 1;

        /*
         * Allocate some extra space so that the transaction commit path is less
         * likely to have to reallocate, since that requires a transaction
         * restart:
         */
        new_u64s = min(256U, (new_u64s * 3) / 2);

        if (new_u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(new_u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
                if (!new_k) {
                        bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
                                bch2_btree_ids[ck->key.btree_id], new_u64s);
                        ret = -ENOMEM;
                        goto err;
                }
        }

        ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
        if (ret) {
                kfree(new_k);
                goto err;
        }

        if (new_k) {
                kfree(ck->k);
                ck->u64s = new_u64s;
                ck->k = new_k;
        }

        bkey_reassemble(ck->k, k);
        ck->valid = true;
        bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

        /* We're not likely to need this iterator again: */
        path->preserve = false;
err:
        bch2_path_put(trans, path, 0);
        return ret;
}

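/*
 * Slow path for traversing to a cached key: create the bkey_cached if it
 * doesn't exist yet, upgrade locks as needed and fill it from the btree.
 */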
static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
                                         unsigned flags)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
        int ret = 0;

        BUG_ON(path->level);

        path->l[1].b = NULL;

        if (bch2_btree_node_relock(trans, path, 0)) {
                ck = (void *) path->l[0].b;
                goto fill;
        }
retry:
        ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
        if (!ck) {
                ck = btree_key_cache_create(trans, path);
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
                if (!ck)
                        goto retry;

                mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);

                ret = btree_node_lock(trans, path, (void *) ck, 0,
                                      lock_want, _THIS_IP_);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto err;

                BUG_ON(ret);

                if (ck->key.btree_id != path->btree_id ||
                    bpos_cmp(ck->key.pos, path->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }

                mark_btree_node_locked(trans, path, 0, lock_want);
        }

        path->l[0].lock_seq     = ck->c.lock.state.seq;
        path->l[0].b            = (void *) ck;
fill:
        if (!ck->valid) {
                /*
                 * Using the underscore version because we haven't set
                 * path->uptodate yet:
                 */
                if (!path->locks_want &&
                    !__bch2_btree_path_upgrade(trans, path, 1)) {
                        trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
                        ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
                        goto err;
                }

                ret = btree_key_cache_fill(trans, path, ck);
                if (ret)
                        goto err;
        }

        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

        path->uptodate = BTREE_ITER_UPTODATE;
        BUG_ON(!ck->valid);
        BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

        return ret;
err:
        if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                btree_node_unlock(trans, path, 0);
                path->l[0].b = ERR_PTR(ret);
        }
        return ret;
}

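/*
 * Fast path: the cached key already exists and is valid, so we only have to
 * find and lock it; anything more goes through the slowpath above.
 */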
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
                                    unsigned flags)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
        int ret = 0;

        EBUG_ON(path->level);

        path->l[1].b = NULL;

        if (bch2_btree_node_relock(trans, path, 0)) {
                ck = (void *) path->l[0].b;
                goto fill;
        }
retry:
        ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
        if (!ck) {
                return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);

                ret = btree_node_lock(trans, path, (void *) ck, 0,
                                      lock_want, _THIS_IP_);
                EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

                if (ret)
                        return ret;

                if (ck->key.btree_id != path->btree_id ||
                    bpos_cmp(ck->key.pos, path->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }

                mark_btree_node_locked(trans, path, 0, lock_want);
        }

        path->l[0].lock_seq     = ck->c.lock.state.seq;
        path->l[0].b            = (void *) ck;
fill:
        if (!ck->valid)
                return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

        path->uptodate = BTREE_ITER_UPTODATE;
        EBUG_ON(!ck->valid);
        EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

        return ret;
}

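/*
 * Flush a dirty cached key back to the btree (and optionally evict it):
 * commit the cached value as a normal btree update, then drop the journal
 * pin and prereservation that were being held for it.
 */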
static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                     struct bkey_cached_key key,
                                     u64 journal_seq,
                                     unsigned commit_flags,
                                     bool evict)
{
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
        struct btree_iter c_iter, b_iter;
        struct bkey_cached *ck = NULL;
        int ret;

        bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
                             BTREE_ITER_SLOTS|
                             BTREE_ITER_INTENT|
                             BTREE_ITER_ALL_SNAPSHOTS);
        bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
                             BTREE_ITER_CACHED|
                             BTREE_ITER_INTENT);
        b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

        ret = bch2_btree_iter_traverse(&c_iter);
        if (ret)
                goto out;

        ck = (void *) c_iter.path->l[0].b;
        if (!ck)
                goto out;

        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                if (evict)
                        goto evict;
                goto out;
        }

        BUG_ON(!ck->valid);

        if (journal_seq && ck->journal.seq != journal_seq)
                goto out;

        /*
         * Since journal reclaim depends on us making progress here, and the
         * allocator/copygc depend on journal reclaim making progress, we need
         * to be using alloc reserves:
         */
        ret   = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
                                  BTREE_UPDATE_KEY_CACHE_RECLAIM|
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_USE_RESERVE|
                                  (ck->journal.seq == journal_last_seq(j)
                                   ? JOURNAL_WATERMARK_reserved
                                   : 0)|
                                  commit_flags);

        bch2_fs_fatal_err_on(ret &&
                             !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
                             !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
                             !bch2_journal_error(j), c,
                             "error flushing key cache: %s", bch2_err_str(ret));
        if (ret)
                goto out;

        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);

        BUG_ON(!btree_node_locked(c_iter.path, 0));

        if (!evict) {
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                        atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }
        } else {
                struct btree_path *path2;
evict:
                trans_for_each_path(trans, path2)
                        if (path2 != c_iter.path)
                                __bch2_btree_path_unlock(trans, path2);

                bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);

                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                        atomic_long_dec(&c->btree_key_cache.nr_dirty);
                }

                mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
                bkey_cached_evict(&c->btree_key_cache, ck);
                bkey_cached_free_fast(&c->btree_key_cache, ck);
        }
out:
        bch2_trans_iter_exit(trans, &b_iter);
        bch2_trans_iter_exit(trans, &c_iter);
        return ret;
}

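/*
 * Journal pin flush callback: journal reclaim calls this when it needs the
 * update pinning sequence number @seq to be flushed so the journal can make
 * forward progress.
 */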
int bch2_btree_key_cache_journal_flush(struct journal *j,
                                struct journal_entry_pin *pin, u64 seq)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
        struct btree_trans trans;
        int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        int ret = 0;

        bch2_trans_init(&trans, c, 0, 0);

        btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
        key = ck->key;

        if (ck->journal.seq != seq ||
            !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_read(&ck->c.lock);
                goto unlock;
        }
        six_unlock_read(&ck->c.lock);

        ret = commit_do(&trans, NULL, NULL, 0,
                btree_key_cache_flush_pos(&trans, key, seq,
                                BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

        bch2_trans_exit(&trans);
        return ret;
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
                               enum btree_id id, struct bpos pos)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached_key key = { id, pos };

        /* Fastpath - assume it won't be found: */
        if (!bch2_btree_key_cache_find(c, id, pos))
                return 0;

        return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}

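/*
 * Update a key in the key cache at transaction commit time: copy the new
 * value into the cached entry, move journal prereservation space over to it,
 * mark it dirty and pin the journal sequence number it now depends on.
 */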
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
                                  struct btree_path *path,
                                  struct bkey_i *insert)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) path->l[0].b;
        bool kick_reclaim = false;

        BUG_ON(insert->u64s > ck->u64s);

        if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                int difference;

                BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);

                difference = jset_u64s(insert->u64s) - ck->res.u64s;
                if (difference > 0) {
                        trans->journal_preres.u64s      -= difference;
                        ck->res.u64s                    += difference;
                }
        }

        bkey_copy(ck->k, insert);
        ck->valid = true;

        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
                atomic_long_inc(&c->btree_key_cache.nr_dirty);

                if (bch2_nr_btree_keys_need_flush(c))
                        kick_reclaim = true;
        }

        bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
                                &ck->journal, bch2_btree_key_cache_journal_flush);

        if (kick_reclaim)
                journal_reclaim_kick(&c->journal);
        return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
                               struct btree_path *path)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) path->l[0].b;

        BUG_ON(!ck->valid);

        /*
         * We just did an update to the btree, bypassing the key cache: the key
         * cache key is now stale and must be dropped, even if dirty:
         */
        if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                atomic_long_dec(&c->btree_key_cache.nr_dirty);
                bch2_journal_pin_drop(&c->journal, &ck->journal);
        }

        ck->valid = false;
}

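/*
 * Shrinker scan: free entries on the freed lists whose SRCU grace period has
 * elapsed, then walk the hash table evicting clean entries that haven't been
 * accessed since the last scan (clearing the accessed bit gives every entry
 * one extra pass).
 */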
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bucket_table *tbl;
        struct bkey_cached *ck, *t;
        size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
        unsigned start, flags;
        int srcu_idx;

        mutex_lock(&bc->lock);
        srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        flags = memalloc_nofs_save();

        /*
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
        list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
                        break;

                list_del(&ck->list);
                six_lock_pcpu_free(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
                scanned++;
                freed++;
        }

        if (scanned >= nr)
                goto out;

        list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
                        break;

                list_del(&ck->list);
                six_lock_pcpu_free(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
                scanned++;
                freed++;
        }

        if (scanned >= nr)
                goto out;

        rcu_read_lock();
        tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
        if (bc->shrink_iter >= tbl->size)
                bc->shrink_iter = 0;
        start = bc->shrink_iter;

        do {
                struct rhash_head *pos, *next;

                pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

                while (!rht_is_a_nulls(pos)) {
                        next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
                        ck = container_of(pos, struct bkey_cached, hash);

                        if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
                                goto next;

                        if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                                clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
                        else if (bkey_cached_lock_for_evict(ck)) {
                                bkey_cached_evict(bc, ck);
                                bkey_cached_free(bc, ck);
                        }

                        scanned++;
                        if (scanned >= nr)
                                break;
next:
                        pos = next;
                }

                bc->shrink_iter++;
                if (bc->shrink_iter >= tbl->size)
                        bc->shrink_iter = 0;
        } while (scanned < nr && bc->shrink_iter != start);

        rcu_read_unlock();
out:
        memalloc_nofs_restore(flags);
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
        mutex_unlock(&bc->lock);

        return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;
        long nr = atomic_long_read(&bc->nr_keys) -
                atomic_long_read(&bc->nr_dirty);

        return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        struct bucket_table *tbl;
        struct bkey_cached *ck, *n;
        struct rhash_head *pos;
        unsigned i;
#ifdef __KERNEL__
        int cpu;
#endif

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        mutex_lock(&bc->lock);

        /*
         * The loop is needed to guard against racing with rehash:
         */
        while (atomic_long_read(&bc->nr_keys)) {
                rcu_read_lock();
                tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
                if (tbl)
                        for (i = 0; i < tbl->size; i++)
                                rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
                                        bkey_cached_evict(bc, ck);
                                        list_add(&ck->list, &bc->freed_nonpcpu);
                                }
                rcu_read_unlock();
        }

#ifdef __KERNEL__
        for_each_possible_cpu(cpu) {
                struct btree_key_cache_freelist *f =
                        per_cpu_ptr(bc->pcpu_freed, cpu);

                for (i = 0; i < f->nr; i++) {
                        ck = f->objs[i];
                        list_add(&ck->list, &bc->freed_nonpcpu);
                }
        }
#endif

        list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);

        list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) {
                cond_resched();

                bch2_journal_pin_drop(&c->journal, &ck->journal);
                bch2_journal_preres_put(&c->journal, &ck->res);

                list_del(&ck->list);
                kfree(ck->k);
                six_lock_pcpu_free(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
        }

        if (atomic_long_read(&bc->nr_dirty) &&
            !bch2_journal_error(&c->journal) &&
            test_bit(BCH_FS_WAS_RW, &c->flags))
                panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
                      atomic_long_read(&bc->nr_dirty));

        if (atomic_long_read(&bc->nr_keys))
                panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
                      atomic_long_read(&bc->nr_keys));

        mutex_unlock(&bc->lock);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);

        free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
        mutex_init(&c->lock);
        INIT_LIST_HEAD(&c->freed_pcpu);
        INIT_LIST_HEAD(&c->freed_nonpcpu);
}

static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
        struct btree_key_cache *bc =
                container_of(shrink, struct btree_key_cache, shrink);

        bch2_btree_key_cache_to_text(out, bc);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        int ret;

#ifdef __KERNEL__
        bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
        if (!bc->pcpu_freed)
                return -ENOMEM;
#endif

        ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params);
        if (ret)
                return ret;

        bc->table_init_done = true;

        bc->shrink.seeks                = 0;
        bc->shrink.count_objects        = bch2_btree_key_cache_count;
        bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
        bc->shrink.to_text              = bch2_btree_key_cache_shrinker_to_text;
        return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name);
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
        prt_printf(out, "nr_freed:\t%zu",       atomic_long_read(&c->nr_freed));
        prt_newline(out);
        prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
        prt_newline(out);
        prt_printf(out, "nr_dirty:\t%lu",       atomic_long_read(&c->nr_dirty));
        prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
        kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
        bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
        if (!bch2_key_cache)
                return -ENOMEM;

        return 0;
}