/* libbcachefs/btree_key_cache.c */

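/*
 * Btree key cache: an in-memory write-back cache of individual btree keys,
 * indexed by (btree_id, pos) in an rhashtable. Updates land in the cached key
 * and pin the journal; dirty entries are written back to the btree by journal
 * reclaim via bch2_btree_key_cache_journal_flush(), and clean entries are
 * reclaimed under memory pressure by a shrinker.
 */
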
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"

#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

static struct kmem_cache *bch2_key_cache;

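/*
 * rhashtable compare function: cached keys are indexed by (btree_id, pos),
 * carried in struct bkey_cached_key.
 */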
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return cmp_int(ck->key.btree_id, key->btree_id) ?:
		bpos_cmp(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

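/*
 * Look up a cached key by (btree_id, pos) in the key cache hash table;
 * returns NULL if no entry is present.
 */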
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

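/*
 * Try to take both intent and write locks on a cached key so it can be
 * evicted; fails without blocking if either trylock fails or the key is
 * dirty.
 */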
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_write(&ck->c.lock);
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

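/*
 * Remove a cached key from the hash table and poison its key so stale
 * lookups can't match it.
 */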
static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

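/*
 * Move an evicted (clean, write locked) entry to the freed list; the memory
 * isn't returned to the slab until the btree_trans_barrier SRCU grace period
 * started here has elapsed (see bch2_btree_key_cache_scan()).
 */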
static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_move_tail(&ck->list, &bc->freed);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

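/*
 * Like bkey_cached_free(), but first tries to stash the entry on this CPU's
 * percpu freelist to avoid taking bc->lock; if the percpu list is full, half
 * of it is spilled back to the shared freed list under the lock.
 */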
static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct btree_key_cache_freelist *f;
	bool freed = false;

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	preempt_disable();
	f = this_cpu_ptr(bc->pcpu_freed);

	if (f->nr < ARRAY_SIZE(f->objs)) {
		f->objs[f->nr++] = ck;
		freed = true;
	}
	preempt_enable();

	if (!freed) {
		mutex_lock(&bc->lock);
		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		while (f->nr > ARRAY_SIZE(f->objs) / 2) {
			struct bkey_cached *ck2 = f->objs[--f->nr];

			list_move_tail(&ck2->list, &bc->freed);
		}
		preempt_enable();

		list_move_tail(&ck->list, &bc->freed);
		mutex_unlock(&bc->lock);
	}

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

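/*
 * Allocate a bkey_cached, preferring the percpu freelist, then the shared
 * freed list, and finally the slab allocator; entries are returned intent
 * and write locked.
 */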
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
{
	struct bkey_cached *ck = NULL;
	struct btree_key_cache_freelist *f;

	preempt_disable();
	f = this_cpu_ptr(c->pcpu_freed);
	if (f->nr)
		ck = f->objs[--f->nr];
	preempt_enable();

	if (!ck) {
		mutex_lock(&c->lock);
		preempt_disable();
		f = this_cpu_ptr(c->pcpu_freed);

		while (!list_empty(&c->freed) &&
		       f->nr < ARRAY_SIZE(f->objs) / 2) {
			ck = list_last_entry(&c->freed, struct bkey_cached, list);
			list_del_init(&ck->list);
			f->objs[f->nr++] = ck;
		}

		ck = f->nr ? f->objs[--f->nr] : NULL;
		preempt_enable();
		mutex_unlock(&c->lock);
	}

	if (ck) {
		six_lock_intent(&ck->c.lock, NULL, NULL);
		six_lock_write(&ck->c.lock, NULL, NULL);
		return ck;
	}

	ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
	if (likely(ck)) {
		INIT_LIST_HEAD(&ck->list);
		six_lock_init(&ck->c.lock);
		BUG_ON(!six_trylock_intent(&ck->c.lock));
		BUG_ON(!six_trylock_write(&ck->c.lock));
		return ck;
	}

	return NULL;
}

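/*
 * Allocation failed: walk the hash table and steal the first clean entry we
 * can lock for eviction.
 */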
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				rcu_read_unlock();
				return ck;
			}
		}
	rcu_read_unlock();

	return NULL;
}

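/*
 * Allocate (or reuse) a cached key for (btree_id, pos) and insert it into the
 * hash table; returns NULL if we raced with another thread inserting the same
 * key, or an ERR_PTR on allocation failure.
 */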
static struct bkey_cached *
btree_key_cache_create(struct bch_fs *c,
		       enum btree_id btree_id,
		       struct bpos pos)
{
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = true;

	ck = bkey_cached_alloc(bc);

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_ids[btree_id]);
			return ERR_PTR(-ENOMEM);
		}

		was_new = false;
	} else {
		if (btree_id == BTREE_ID_subvolumes)
			six_lock_pcpu_alloc(&ck->c.lock);
		else
			six_lock_pcpu_free(&ck->c.lock);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= btree_id;
	ck->key.btree_id	= btree_id;
	ck->key.pos		= pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
					  &ck->hash,
					  bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

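/*
 * Read the current value of the key from the btree and copy it into the
 * cached key's buffer, reallocating the buffer (with some slack) if it's too
 * small.
 */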
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_path *path;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	struct bkey u;
	int ret;

	path = bch2_path_get(trans, ck->key.btree_id,
			     ck->key.pos, 0, 0, 0, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, path, 0);
	if (ret)
		goto err;

	k = bch2_btree_path_peek_slot(path, &u);

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_trans_restart_relock_key_cache_fill(trans->fn,
				_THIS_IP_, ck_path->btree_id, &ck_path->pos);
		ret = btree_trans_restart(trans);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
		if (!new_k) {
			bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
				bch2_btree_ids[ck->key.btree_id], new_u64s);
			ret = -ENOMEM;
			goto err;
		}
	}

	/*
	 * XXX: not allowed to be holding read locks when we take a write lock,
	 * currently
	 */
	bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	path->preserve = false;
err:
	bch2_path_put(trans, path, 0);
	return ret;
}

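/*
 * six_lock check function: after blocking on the lock, verify that the
 * cached key still matches the path's btree_id and pos (it may have been
 * evicted and reused while we waited).
 */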
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
	struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
	const struct btree_path *path = p;

	return ck->key.btree_id == path->btree_id &&
		!bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
}

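/*
 * Traverse a BTREE_ITER_CACHED path: look up (or create) the cached key for
 * path->pos, lock it, and fill it from the btree if it isn't valid yet.
 */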
__flatten
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		if (flags & BTREE_ITER_CACHED_NOCREATE) {
			path->l[0].b = NULL;
			return 0;
		}

		ck = btree_key_cache_create(c, path->btree_id, path->pos);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
				     lock_want,
				     bkey_cached_check_fn, path, _THIS_IP_)) {
			if (!trans->restarted)
				goto retry;

			ret = -EINTR;
			goto err;
		}

		if (ck->key.btree_id != path->btree_id ||
		    bpos_cmp(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0, lock_want);
	}

	path->l[0].lock_seq	= ck->c.lock.state.seq;
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1)) {
			trace_transaction_restart_ip(trans->fn, _THIS_IP_);
			ret = btree_trans_restart(trans);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
err:
	if (ret != -EINTR) {
		btree_node_unlock(path, 0);
		path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
	}
	return ret;
}

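/*
 * Write a dirty cached key back to the btree and drop its journal pin; if
 * @evict is set, also remove it from the key cache. When @journal_seq is
 * nonzero, only flush if the key is still pinning that journal sequence
 * number.
 */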
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_CACHED_NOFILL|
			     BTREE_ITER_CACHED_NOCREATE|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) c_iter.path->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	/*
	 * Since journal reclaim depends on us making progress here, and the
	 * allocator/copygc depend on journal reclaim making progress, we need
	 * to be using alloc reserves:
	 */
	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL|
				  BTREE_INSERT_USE_RESERVE|
				  (ck->journal.seq == journal_last_seq(j)
				   ? JOURNAL_WATERMARK_reserved
				   : 0)|
				  commit_flags);
	if (ret) {
		bch2_fs_fatal_err_on(ret != -EINTR &&
				     ret != -EAGAIN &&
				     !bch2_journal_error(j), c,
			"error flushing key cache: %i", ret);
		goto out;
	}

	bch2_journal_pin_drop(j, &ck->journal);
	bch2_journal_preres_put(j, &ck->res);

	BUG_ON(!btree_node_locked(c_iter.path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
evict:
		BUG_ON(!btree_node_intent_locked(c_iter.path, 0));

		mark_btree_node_unlocked(c_iter.path, 0);
		c_iter.path->l[0].b = NULL;

		six_lock_write(&ck->c.lock, NULL, NULL);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		bkey_cached_evict(&c->btree_key_cache, ck);

		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

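/*
 * Journal pin flush callback: called by journal reclaim when the journal
 * entry a dirty cached key is pinning needs to be released.
 */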
int bch2_btree_key_cache_journal_flush(struct journal *j,
				struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	int ret = 0;

	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);

	six_lock_read(&ck->c.lock, NULL, NULL);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = bch2_trans_do(c, NULL, NULL, 0,
		btree_key_cache_flush_pos(&trans, key, seq,
				BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	return ret;
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
			       enum btree_id id, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached_key key = { id, pos };

	/* Fastpath - assume it won't be found: */
	if (!bch2_btree_key_cache_find(c, id, pos))
		return 0;

	return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}

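/*
 * Update a cached key in place at transaction commit time: copy the new
 * value into the cached key's buffer, transfer journal preres from the
 * transaction, mark the key dirty and pin the journal entry it was written
 * to.
 */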
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  struct btree_path *path,
				  struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;
	bool kick_reclaim = false;

	BUG_ON(insert->u64s > ck->u64s);

	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		int difference;

		BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);

		difference = jset_u64s(insert->u64s) - ck->res.u64s;
		if (difference > 0) {
			trans->journal_preres.u64s	-= difference;
			ck->res.u64s			+= difference;
		}
	}

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
				&ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

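/*
 * Mark a cached key as no longer valid; the caller must ensure it isn't
 * dirty.
 */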
void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bkey_cached *ck = (void *) path->l[0].b;

	ck->valid = false;

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
}

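/*
 * Shrinker scan callback: free entries on the freed list whose SRCU grace
 * period has elapsed, then walk the hash table evicting clean entries that
 * haven't been accessed since the last scan (second chance via the
 * BKEY_CACHED_ACCESSED bit).
 */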
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		scanned++;
		freed++;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

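/* Shrinker count callback: number of clean (reclaimable) cached keys. */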
static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

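/*
 * Filesystem shutdown: unregister the shrinker, then evict and free every
 * remaining cached key from the hash table, the percpu freelists and the
 * freed list.
 */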
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	unsigned i;
	int cpu;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	mutex_lock(&bc->lock);

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (tbl)
		for (i = 0; i < tbl->size; i++)
			rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
				bkey_cached_evict(bc, ck);
				list_add(&ck->list, &bc->freed);
			}
	rcu_read_unlock();

	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &bc->freed);
		}
	}

	list_for_each_entry_safe(ck, n, &bc->freed, list) {
		cond_resched();

		bch2_journal_pin_drop(&c->journal, &ck->journal);
		bch2_journal_preres_put(&c->journal, &ck->res);

		list_del(&ck->list);
		kfree(ck->k);
		kmem_cache_free(bch2_key_cache, ck);
	}

	BUG_ON(atomic_long_read(&bc->nr_dirty) &&
	       !bch2_journal_error(&c->journal) &&
	       test_bit(BCH_FS_WAS_RW, &c->flags));
	BUG_ON(atomic_long_read(&bc->nr_keys));

	mutex_unlock(&bc->lock);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed);
}

static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
	struct btree_key_cache *bc =
		container_of(shrink, struct btree_key_cache, shrink);

	bch2_btree_key_cache_to_text(out, bc);
}

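/*
 * Per-filesystem initialization: allocate the percpu freelists, set up the
 * hash table, and register the shrinker.
 */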
int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
	int ret;

	c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!c->pcpu_freed)
		return -ENOMEM;

	ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
	if (ret)
		return ret;

	c->table_init_done = true;

	c->shrink.seeks			= 1;
	c->shrink.count_objects	= bch2_btree_key_cache_count;
	c->shrink.scan_objects		= bch2_btree_key_cache_scan;
	c->shrink.to_text		= bch2_btree_key_cache_shrinker_to_text;
	return register_shrinker(&c->shrink);
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%zu\n",	atomic_long_read(&c->nr_freed));
	prt_printf(out, "nr_keys:\t%lu\n",	atomic_long_read(&c->nr_keys));
	prt_printf(out, "nr_dirty:\t%lu\n",	atomic_long_read(&c->nr_dirty));
}

void bch2_btree_key_cache_exit(void)
{
	if (bch2_key_cache)
		kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}