
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"

#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
                                       const void *obj)
{
        const struct bkey_cached *ck = obj;
        const struct bkey_cached_key *key = arg->key;

        return cmp_int(ck->key.btree_id, key->btree_id) ?:
                bkey_cmp(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
        .head_offset    = offsetof(struct bkey_cached, hash),
        .key_offset     = offsetof(struct bkey_cached, key),
        .key_len        = sizeof(struct bkey_cached_key),
        .obj_cmpfn      = bch2_btree_key_cache_cmp_fn,
};

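/*
 * Look up a cached key by (btree_id, pos) in the key cache hash table;
 * returns NULL if no entry is present.
 */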
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
        struct bkey_cached_key key = {
                .btree_id       = btree_id,
                .pos            = pos,
        };

        return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
                                      bch2_btree_key_cache_params);
}

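/*
 * Try to take both the intent and write locks on @ck; fails (returning false)
 * if either lock is contended or if the entry is dirty.
 */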
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
        if (!six_trylock_intent(&ck->c.lock))
                return false;

        if (!six_trylock_write(&ck->c.lock)) {
                six_unlock_intent(&ck->c.lock);
                return false;
        }

        if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_write(&ck->c.lock);
                six_unlock_intent(&ck->c.lock);
                return false;
        }

        return true;
}

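/*
 * Remove @ck from the hash table and poison its key so stale lookups can't
 * match it; caller must hold the entry's write lock.
 */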
static void bkey_cached_evict(struct btree_key_cache *c,
                              struct bkey_cached *ck)
{
        BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
                                      bch2_btree_key_cache_params));
        memset(&ck->key, ~0, sizeof(ck->key));

        c->nr_keys--;
}

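/*
 * Put @ck on the freed list and drop its key buffer; the SRCU barrier
 * sequence recorded here tells the shrinker when the memory may actually be
 * freed.
 */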
static void bkey_cached_free(struct btree_key_cache *bc,
                             struct bkey_cached *ck)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

        BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);

        list_move_tail(&ck->list, &bc->freed);
        bc->nr_freed++;

        kfree(ck->k);
        ck->k           = NULL;
        ck->u64s        = 0;

        six_unlock_write(&ck->c.lock);
        six_unlock_intent(&ck->c.lock);
}

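/*
 * Allocate a cached key: reuse an entry from the freed list if one is
 * available, otherwise allocate from the slab, and as a last resort steal a
 * clean entry. The returned entry is locked for intent and write.
 */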
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
{
        struct bkey_cached *ck;

        list_for_each_entry_reverse(ck, &c->freed, list)
                if (bkey_cached_lock_for_evict(ck)) {
                        c->nr_freed--;
                        return ck;
                }

        ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
        if (likely(ck)) {
                INIT_LIST_HEAD(&ck->list);
                six_lock_init(&ck->c.lock);
                BUG_ON(!six_trylock_intent(&ck->c.lock));
                BUG_ON(!six_trylock_write(&ck->c.lock));
                return ck;
        }

        list_for_each_entry(ck, &c->clean, list)
                if (bkey_cached_lock_for_evict(ck)) {
                        bkey_cached_evict(c, ck);
                        return ck;
                }

        return NULL;
}

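/*
 * Allocate a new, not yet valid entry for @pos and insert it into the hash
 * table; returns NULL if we raced with another insertion, in which case the
 * caller should retry the lookup.
 */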
static struct bkey_cached *
btree_key_cache_create(struct btree_key_cache *c,
                       enum btree_id btree_id,
                       struct bpos pos)
{
        struct bkey_cached *ck;

        ck = bkey_cached_alloc(c);
        if (!ck)
                return ERR_PTR(-ENOMEM);

        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
        ck->key.btree_id        = btree_id;
        ck->key.pos             = pos;
        ck->valid               = false;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;

        if (rhashtable_lookup_insert_fast(&c->table,
                                          &ck->hash,
                                          bch2_btree_key_cache_params)) {
                /* We raced with another fill: */
                bkey_cached_free(c, ck);
                return NULL;
        }

        c->nr_keys++;

        list_move(&ck->list, &c->clean);
        six_unlock_write(&ck->c.lock);

        return ck;
}

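/*
 * Read the key at @ck's position from the btree and copy it into the cached
 * entry, reallocating the key buffer if it's too small; requires @ck_iter to
 * be intent locked.
 */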
static int btree_key_cache_fill(struct btree_trans *trans,
                                struct btree_iter *ck_iter,
                                struct bkey_cached *ck)
{
        struct btree_iter *iter;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
        int ret;

        iter = bch2_trans_get_iter(trans, ck->key.btree_id,
                                   ck->key.pos, BTREE_ITER_SLOTS);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;

        if (!bch2_btree_node_relock(ck_iter, 0)) {
                trace_transaction_restart_ip(trans->ip, _THIS_IP_);
                ret = -EINTR;
                goto err;
        }

        if (k.k->u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(k.k->u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
                if (!new_k) {
                        ret = -ENOMEM;
                        goto err;
                }
        }

        bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
        if (new_k) {
                kfree(ck->k);
                ck->u64s = new_u64s;
                ck->k = new_k;
        }

        bkey_reassemble(ck->k, k);
        ck->valid = true;
        bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);

        /* We're not likely to need this iterator again: */
        set_btree_iter_dontneed(trans, iter);
err:
        bch2_trans_iter_put(trans, iter);
        return ret;
}

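/*
 * six_lock check function: after blocking on the lock, verify that the entry
 * still refers to the position the iterator wants (it may have been evicted
 * and reused while we slept).
 */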
static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
        struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
        const struct btree_iter *iter = p;

        return ck->key.btree_id == iter->btree_id &&
                !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}

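/*
 * Traverse to the key cache entry for @iter's position, creating and/or
 * filling it from the btree as needed (unless BTREE_ITER_CACHED_NOCREATE or
 * BTREE_ITER_CACHED_NOFILL say otherwise), and lock it according to the
 * iterator's flags.
 */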
__flatten
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
        struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
        int ret = 0;

        BUG_ON(iter->level);

        if (btree_node_locked(iter, 0)) {
                ck = (void *) iter->l[0].b;
                goto fill;
        }
retry:
        ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
        if (!ck) {
                if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
                        iter->l[0].b = NULL;
                        return 0;
                }

                mutex_lock(&c->btree_key_cache.lock);
                ck = btree_key_cache_create(&c->btree_key_cache,
                                            iter->btree_id, iter->pos);
                mutex_unlock(&c->btree_key_cache.lock);

                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
                if (!ck)
                        goto retry;

                mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
                iter->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(iter, 0);

                if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
                                     bkey_cached_check_fn, iter, _THIS_IP_)) {
                        if (ck->key.btree_id != iter->btree_id ||
                            bkey_cmp(ck->key.pos, iter->pos)) {
                                goto retry;
                        }

                        trace_transaction_restart_ip(trans->ip, _THIS_IP_);
                        ret = -EINTR;
                        goto err;
                }

                if (ck->key.btree_id != iter->btree_id ||
                    bkey_cmp(ck->key.pos, iter->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }

                mark_btree_node_locked(iter, 0, lock_want);
        }

        iter->l[0].lock_seq     = ck->c.lock.state.seq;
        iter->l[0].b            = (void *) ck;
fill:
        if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
                if (!btree_node_intent_locked(iter, 0))
                        bch2_btree_iter_upgrade(iter, 1);
                if (!btree_node_intent_locked(iter, 0)) {
                        trace_transaction_restart_ip(trans->ip, _THIS_IP_);
                        ret = -EINTR;
                        goto err;
                }

                ret = btree_key_cache_fill(trans, iter, ck);
                if (ret)
                        goto err;
        }

        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

        iter->uptodate = BTREE_ITER_NEED_PEEK;

        if (!(iter->flags & BTREE_ITER_INTENT))
                bch2_btree_iter_downgrade(iter);
        else if (!iter->locks_want) {
                if (!__bch2_btree_iter_upgrade(iter, 1))
                        ret = -EINTR;
        }

        return ret;
err:
        if (ret != -EINTR) {
                btree_node_unlock(iter, 0);
                iter->flags |= BTREE_ITER_ERROR;
                iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
        }
        return ret;
}

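/*
 * Flush a dirty cached key back to the btree: commit its contents via a
 * regular btree update, drop the journal pin and prereservation, and
 * optionally evict the entry from the cache afterwards.
 */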
static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                     struct bkey_cached_key key,
                                     u64 journal_seq,
                                     bool evict)
{
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
        struct btree_iter *c_iter = NULL, *b_iter = NULL;
        struct bkey_cached *ck = NULL;
        int ret;

        b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
                                     BTREE_ITER_SLOTS|
                                     BTREE_ITER_INTENT);
        c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
                                     BTREE_ITER_CACHED|
                                     BTREE_ITER_CACHED_NOFILL|
                                     BTREE_ITER_CACHED_NOCREATE|
                                     BTREE_ITER_INTENT);
retry:
        ret = bch2_btree_iter_traverse(c_iter);
        if (ret)
                goto err;

        ck = (void *) c_iter->l[0].b;
        if (!ck ||
            (journal_seq && ck->journal.seq != journal_seq))
                goto out;

        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                if (!evict)
                        goto out;
                goto evict;
        }

        ret   = bch2_btree_iter_traverse(b_iter) ?:
                bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_JOURNAL_RESERVED|
                                  BTREE_INSERT_JOURNAL_RECLAIM);
err:
        if (ret == -EINTR)
                goto retry;

        if (ret) {
                bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
                        "error flushing key cache: %i", ret);
                goto out;
        }

        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);

        if (!evict) {
                mutex_lock(&c->btree_key_cache.lock);
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                        c->btree_key_cache.nr_dirty--;
                }

                list_move_tail(&ck->list, &c->btree_key_cache.clean);
                mutex_unlock(&c->btree_key_cache.lock);
        } else {
evict:
                BUG_ON(!btree_node_intent_locked(c_iter, 0));

                mark_btree_node_unlocked(c_iter, 0);
                c_iter->l[0].b = NULL;

                six_lock_write(&ck->c.lock, NULL, NULL);

                mutex_lock(&c->btree_key_cache.lock);
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                        clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
                        c->btree_key_cache.nr_dirty--;
                }

                bkey_cached_evict(&c->btree_key_cache, ck);
                bkey_cached_free(&c->btree_key_cache, ck);
                mutex_unlock(&c->btree_key_cache.lock);
        }
out:
        bch2_trans_iter_put(trans, b_iter);
        bch2_trans_iter_put(trans, c_iter);
        return ret;
}

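/*
 * Journal reclaim callback: flush the cached key that is pinning journal
 * sequence number @seq, if it's still dirty at that sequence.
 */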
static void btree_key_cache_journal_flush(struct journal *j,
                                          struct journal_entry_pin *pin,
                                          u64 seq)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
        struct btree_trans trans;

        int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);

        six_lock_read(&ck->c.lock, NULL, NULL);
        key = ck->key;

        if (ck->journal.seq != seq ||
            !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                six_unlock_read(&ck->c.lock);
                goto unlock;
        }
        six_unlock_read(&ck->c.lock);

        bch2_trans_init(&trans, c, 0, 0);
        btree_key_cache_flush_pos(&trans, key, seq, false);
        bch2_trans_exit(&trans);
unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
                               enum btree_id id, struct bpos pos)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached_key key = { id, pos };

        /* Fastpath - assume it won't be found: */
        if (!bch2_btree_key_cache_find(c, id, pos))
                return 0;

        return btree_key_cache_flush_pos(trans, key, 0, true);
}

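/*
 * Update a cached key as part of a transaction commit: copy in the new
 * value, transfer journal prereservation from the transaction to the entry,
 * mark it dirty, and pin the journal sequence it was written in.
 */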
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
                                  struct btree_iter *iter,
                                  struct bkey_i *insert)
{
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck = (void *) iter->l[0].b;
        bool kick_reclaim = false;

        BUG_ON(insert->u64s > ck->u64s);

        if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
                int difference;

                BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);

                difference = jset_u64s(insert->u64s) - ck->res.u64s;
                if (difference > 0) {
                        trans->journal_preres.u64s      -= difference;
                        ck->res.u64s                    += difference;
                }
        }

        bkey_copy(ck->k, insert);
        ck->valid = true;

        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
                mutex_lock(&c->btree_key_cache.lock);
                list_move(&ck->list, &c->btree_key_cache.dirty);

                set_bit(BKEY_CACHED_DIRTY, &ck->flags);
                c->btree_key_cache.nr_dirty++;

                if (bch2_nr_btree_keys_need_flush(c))
                        kick_reclaim = true;

                mutex_unlock(&c->btree_key_cache.lock);
        }

        bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
                                &ck->journal, btree_key_cache_journal_flush);

        if (kick_reclaim)
                journal_reclaim_kick(&c->journal);
        return true;
}

#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
                               enum btree_id id, struct bpos pos)
{
        BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
}
#endif

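/*
 * Shrinker scan callback: free entries on the freed list whose SRCU grace
 * period has elapsed, then evict clean entries that haven't been accessed
 * since the last scan.
 */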
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck, *t;
        size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
        unsigned flags;

        /* Return -1 if we can't do anything right now */
        if (sc->gfp_mask & __GFP_FS)
                mutex_lock(&bc->lock);
        else if (!mutex_trylock(&bc->lock))
                return -1;

        flags = memalloc_nofs_save();

        /*
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
        list_for_each_entry_safe(ck, t, &bc->freed, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
                        break;

                list_del(&ck->list);
                kmem_cache_free(bch2_key_cache, ck);
                bc->nr_freed--;
                scanned++;
                freed++;
        }

        if (scanned >= nr)
                goto out;

        list_for_each_entry_safe(ck, t, &bc->clean, list) {
                if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                        clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
                else if (bkey_cached_lock_for_evict(ck)) {
                        bkey_cached_evict(bc, ck);
                        bkey_cached_free(bc, ck);
                }

                scanned++;
                if (scanned >= nr) {
                        if (&t->list != &bc->clean)
                                list_move_tail(&bc->clean, &t->list);
                        goto out;
                }
        }
out:
        memalloc_nofs_restore(flags);
        mutex_unlock(&bc->lock);

        return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
{
        struct bch_fs *c = container_of(shrink, struct bch_fs,
                                        btree_key_cache.shrink);
        struct btree_key_cache *bc = &c->btree_key_cache;

        return bc->nr_keys;
}

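/*
 * Tear down the key cache at filesystem shutdown, freeing every remaining
 * entry; dirty entries are only expected here if the journal is in an error
 * state.
 */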
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
        struct bkey_cached *ck, *n;

        if (bc->shrink.list.next)
                unregister_shrinker(&bc->shrink);

        mutex_lock(&bc->lock);
        list_splice(&bc->dirty, &bc->clean);

        list_for_each_entry_safe(ck, n, &bc->clean, list) {
                cond_resched();

                bch2_journal_pin_drop(&c->journal, &ck->journal);
                bch2_journal_preres_put(&c->journal, &ck->res);

                kfree(ck->k);
                list_del(&ck->list);
                kmem_cache_free(bch2_key_cache, ck);
                bc->nr_keys--;
        }

        BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
        BUG_ON(bc->nr_keys);

        list_for_each_entry_safe(ck, n, &bc->freed, list) {
                cond_resched();

                list_del(&ck->list);
                kmem_cache_free(bch2_key_cache, ck);
        }
        mutex_unlock(&bc->lock);

        if (bc->table_init_done)
                rhashtable_destroy(&bc->table);
}

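/*
 * Per-filesystem initialization is split in two: init_early() sets up the
 * lock and lists before the filesystem is fully constructed; init() then
 * registers the shrinker and the hash table.
 */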
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
        mutex_init(&c->lock);
        INIT_LIST_HEAD(&c->freed);
        INIT_LIST_HEAD(&c->clean);
        INIT_LIST_HEAD(&c->dirty);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
        int ret;

        c->shrink.seeks                 = 1;
        c->shrink.count_objects        = bch2_btree_key_cache_count;
        c->shrink.scan_objects          = bch2_btree_key_cache_scan;

        ret = register_shrinker(&c->shrink);
        if (ret)
                return ret;

        ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
        if (ret)
                return ret;

        c->table_init_done = true;
        return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
        pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
        pr_buf(out, "nr_keys:\t%zu\n",  c->nr_keys);
        pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
}

void bch2_btree_key_cache_exit(void)
{
        if (bch2_key_cache)
                kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
        bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
        if (!bch2_key_cache)
                return -ENOMEM;

        return 0;
}