1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_methods.h"
6 #include "btree_cache.h"
7 #include "btree_gc.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "btree_io.h"
11 #include "btree_iter.h"
12 #include "btree_locking.h"
13 #include "buckets.h"
14 #include "extents.h"
15 #include "journal.h"
16 #include "journal_reclaim.h"
17 #include "keylist.h"
18 #include "replicas.h"
19 #include "super-io.h"
20
21 #include <linux/random.h>
22 #include <trace/events/bcachefs.h>
23
24 /* Debug code: */
25
26 /*
27  * Verify that child nodes correctly span parent node's range:
28  */
29 static void btree_node_interior_verify(struct btree *b)
30 {
31 #ifdef CONFIG_BCACHEFS_DEBUG
32         struct bpos next_node = b->data->min_key;
33         struct btree_node_iter iter;
34         struct bkey_s_c k;
35         struct bkey_s_c_btree_ptr_v2 bp;
36         struct bkey unpacked;
37
38         BUG_ON(!b->c.level);
39
40         bch2_btree_node_iter_init_from_start(&iter, b);
41
42         while (1) {
43                 k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
44                 if (k.k->type != KEY_TYPE_btree_ptr_v2)
45                         break;
46                 bp = bkey_s_c_to_btree_ptr_v2(k);
47
48                 BUG_ON(bkey_cmp(next_node, bp.v->min_key));
49
50                 bch2_btree_node_iter_advance(&iter, b);
51
52                 if (bch2_btree_node_iter_end(&iter)) {
53                         BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
54                         break;
55                 }
56
57                 next_node = bkey_successor(k.k->p);
58         }
59 #endif
60 }
61
62 /* Calculate ideal packed bkey format for new btree nodes: */
63
64 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
65 {
66         struct bkey_packed *k;
67         struct bset_tree *t;
68         struct bkey uk;
69
70         bch2_bkey_format_add_pos(s, b->data->min_key);
71
72         for_each_bset(b, t)
73                 bset_tree_for_each_key(b, t, k)
74                         if (!bkey_whiteout(k)) {
75                                 uk = bkey_unpack_key(b, k);
76                                 bch2_bkey_format_add_key(s, &uk);
77                         }
78 }
79
80 static struct bkey_format bch2_btree_calc_format(struct btree *b)
81 {
82         struct bkey_format_state s;
83
84         bch2_bkey_format_init(&s);
85         __bch2_btree_calc_format(&s, b);
86
87         return bch2_bkey_format_done(&s);
88 }
89
90 static size_t btree_node_u64s_with_format(struct btree *b,
91                                           struct bkey_format *new_f)
92 {
93         struct bkey_format *old_f = &b->format;
94
95         /* stupid integer promotion rules */
96         ssize_t delta =
97             (((int) new_f->key_u64s - old_f->key_u64s) *
98              (int) b->nr.packed_keys) +
99             (((int) new_f->key_u64s - BKEY_U64s) *
100              (int) b->nr.unpacked_keys);
101
102         BUG_ON(delta + b->nr.live_u64s < 0);
103
104         return b->nr.live_u64s + delta;
105 }
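/*
 * Editorial sketch with made-up numbers (not original source text): with 100
 * packed keys, old key_u64s = 2 and new key_u64s = 3, the packed term above
 * contributes (3 - 2) * 100 = 100 extra u64s; the unpacked term scales the
 * same way with (new key_u64s - BKEY_U64s). The result is what live_u64s
 * would be if the node were rewritten in the new format.
 */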
106
107 /**
108  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
109  *
110  * This assumes all keys can pack with the new format -- it just checks if
111  * the re-packed keys would fit inside the node itself.
112  */
113 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
114                                  struct bkey_format *new_f)
115 {
116         size_t u64s = btree_node_u64s_with_format(b, new_f);
117
118         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
119 }
120
121 /* Btree node freeing/allocation: */
122
123 static void __btree_node_free(struct bch_fs *c, struct btree *b)
124 {
125         trace_btree_node_free(c, b);
126
127         BUG_ON(btree_node_dirty(b));
128         BUG_ON(btree_node_need_write(b));
129         BUG_ON(b == btree_node_root(c, b));
130         BUG_ON(b->ob.nr);
131         BUG_ON(!list_empty(&b->write_blocked));
132         BUG_ON(b->will_make_reachable);
133
134         clear_btree_node_noevict(b);
135
136         bch2_btree_node_hash_remove(&c->btree_cache, b);
137
138         six_lock_wakeup_all(&b->c.lock);
139
140         mutex_lock(&c->btree_cache.lock);
141         list_move(&b->list, &c->btree_cache.freeable);
142         mutex_unlock(&c->btree_cache.lock);
143 }
144
145 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
146 {
147         struct open_buckets ob = b->ob;
148
149         b->ob.nr = 0;
150
151         clear_btree_node_dirty(b);
152
153         btree_node_lock_type(c, b, SIX_LOCK_write);
154         __btree_node_free(c, b);
155         six_unlock_write(&b->c.lock);
156
157         bch2_open_buckets_put(c, &ob);
158 }
159
160 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
161                                 struct btree_iter *iter)
162 {
163         struct btree_iter *linked;
164
165         trans_for_each_iter(iter->trans, linked)
166                 BUG_ON(linked->l[b->c.level].b == b);
167
168         six_lock_write(&b->c.lock, NULL, NULL);
169         __btree_node_free(c, b);
170         six_unlock_write(&b->c.lock);
171         six_unlock_intent(&b->c.lock);
172 }
173
174 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
175                                              struct disk_reservation *res,
176                                              struct closure *cl,
177                                              unsigned flags)
178 {
179         struct write_point *wp;
180         struct btree *b;
181         BKEY_PADDED(k) tmp;
182         struct open_buckets ob = { .nr = 0 };
183         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
184         unsigned nr_reserve;
185         enum alloc_reserve alloc_reserve;
186
187         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
188                 nr_reserve      = 0;
189                 alloc_reserve   = RESERVE_ALLOC;
190         } else if (flags & BTREE_INSERT_USE_RESERVE) {
191                 nr_reserve      = BTREE_NODE_RESERVE / 2;
192                 alloc_reserve   = RESERVE_BTREE;
193         } else {
194                 nr_reserve      = BTREE_NODE_RESERVE;
195                 alloc_reserve   = RESERVE_NONE;
196         }
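        /*
         * Editorial summary of the branches above (an interpretation, not
         * original source text): nr_reserve is how much of the preallocated
         * node cache is held back from this caller - allocations with
         * BTREE_INSERT_USE_ALLOC_RESERVE may drain the cache completely and
         * use the RESERVE_ALLOC watermark, BTREE_INSERT_USE_RESERVE may take
         * it down to half of BTREE_NODE_RESERVE and uses RESERVE_BTREE, and
         * everything else leaves BTREE_NODE_RESERVE entries cached and
         * allocates from RESERVE_NONE.
         */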
197
198         mutex_lock(&c->btree_reserve_cache_lock);
199         if (c->btree_reserve_cache_nr > nr_reserve) {
200                 struct btree_alloc *a =
201                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
202
203                 ob = a->ob;
204                 bkey_copy(&tmp.k, &a->k);
205                 mutex_unlock(&c->btree_reserve_cache_lock);
206                 goto mem_alloc;
207         }
208         mutex_unlock(&c->btree_reserve_cache_lock);
209
210 retry:
211         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
212                                       writepoint_ptr(&c->btree_write_point),
213                                       &devs_have,
214                                       res->nr_replicas,
215                                       c->opts.metadata_replicas_required,
216                                       alloc_reserve, 0, cl);
217         if (IS_ERR(wp))
218                 return ERR_CAST(wp);
219
220         if (wp->sectors_free < c->opts.btree_node_size) {
221                 struct open_bucket *ob;
222                 unsigned i;
223
224                 open_bucket_for_each(c, &wp->ptrs, ob, i)
225                         if (ob->sectors_free < c->opts.btree_node_size)
226                                 ob->sectors_free = 0;
227
228                 bch2_alloc_sectors_done(c, wp);
229                 goto retry;
230         }
231
232         if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
233                 bkey_btree_ptr_v2_init(&tmp.k);
234         else
235                 bkey_btree_ptr_init(&tmp.k);
236
237         bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
238
239         bch2_open_bucket_get(c, wp, &ob);
240         bch2_alloc_sectors_done(c, wp);
241 mem_alloc:
242         b = bch2_btree_node_mem_alloc(c);
243
244         /* we hold cannibalize_lock: */
245         BUG_ON(IS_ERR(b));
246         BUG_ON(b->ob.nr);
247
248         bkey_copy(&b->key, &tmp.k);
249         b->ob = ob;
250
251         return b;
252 }
253
254 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
255 {
256         struct bch_fs *c = as->c;
257         struct btree *b;
258         int ret;
259
260         BUG_ON(level >= BTREE_MAX_DEPTH);
261         BUG_ON(!as->nr_prealloc_nodes);
262
263         b = as->prealloc_nodes[--as->nr_prealloc_nodes];
264
265         set_btree_node_accessed(b);
266         set_btree_node_dirty(b);
267         set_btree_node_need_write(b);
268
269         bch2_bset_init_first(b, &b->data->keys);
270         b->c.level      = level;
271         b->c.btree_id   = as->btree_id;
272
273         memset(&b->nr, 0, sizeof(b->nr));
274         b->data->magic = cpu_to_le64(bset_magic(c));
275         b->data->flags = 0;
276         SET_BTREE_NODE_ID(b->data, as->btree_id);
277         SET_BTREE_NODE_LEVEL(b->data, level);
278         b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
279
280         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
281                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
282
283                 bp->v.mem_ptr           = 0;
284                 bp->v.seq               = b->data->keys.seq;
285                 bp->v.sectors_written   = 0;
286                 bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
287         }
288
289         if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
290                 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
291
292         if (btree_node_is_extents(b) &&
293             !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
294                 set_btree_node_old_extent_overwrite(b);
295
296         bch2_btree_build_aux_trees(b);
297
298         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
299         BUG_ON(ret);
300
301         trace_btree_node_alloc(c, b);
302         return b;
303 }
304
305 static void btree_set_min(struct btree *b, struct bpos pos)
306 {
307         if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
308                 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
309         b->data->min_key = pos;
310 }
311
312 static void btree_set_max(struct btree *b, struct bpos pos)
313 {
314         b->key.k.p = pos;
315         b->data->max_key = pos;
316 }
317
318 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
319                                                   struct btree *b,
320                                                   struct bkey_format format)
321 {
322         struct btree *n;
323
324         n = bch2_btree_node_alloc(as, b->c.level);
325
326         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
327
328         btree_set_min(n, b->data->min_key);
329         btree_set_max(n, b->data->max_key);
330
331         n->data->format         = format;
332         btree_node_set_format(n, format);
333
334         bch2_btree_sort_into(as->c, n, b);
335
336         btree_node_reset_sib_u64s(n);
337
338         n->key.k.p = b->key.k.p;
339         return n;
340 }
341
342 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
343                                                        struct btree *b)
344 {
345         struct bkey_format new_f = bch2_btree_calc_format(b);
346
347         /*
348          * The keys might expand with the new format - if they wouldn't fit in
349          * the btree node anymore, use the old format for now:
350          */
351         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
352                 new_f = b->format;
353
354         return __bch2_btree_node_alloc_replacement(as, b, new_f);
355 }
356
357 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
358 {
359         struct btree *b = bch2_btree_node_alloc(as, level);
360
361         btree_set_min(b, POS_MIN);
362         btree_set_max(b, POS_MAX);
363         b->data->format = bch2_btree_calc_format(b);
364
365         btree_node_set_format(b, b->data->format);
366         bch2_btree_build_aux_trees(b);
367
368         bch2_btree_update_add_new_node(as, b);
369         six_unlock_write(&b->c.lock);
370
371         return b;
372 }
373
374 static void bch2_btree_reserve_put(struct btree_update *as)
375 {
376         struct bch_fs *c = as->c;
377
378         mutex_lock(&c->btree_reserve_cache_lock);
379
380         while (as->nr_prealloc_nodes) {
381                 struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
382
383                 six_unlock_write(&b->c.lock);
384
385                 if (c->btree_reserve_cache_nr <
386                     ARRAY_SIZE(c->btree_reserve_cache)) {
387                         struct btree_alloc *a =
388                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
389
390                         a->ob = b->ob;
391                         b->ob.nr = 0;
392                         bkey_copy(&a->k, &b->key);
393                 } else {
394                         bch2_open_buckets_put(c, &b->ob);
395                 }
396
397                 btree_node_lock_type(c, b, SIX_LOCK_write);
398                 __btree_node_free(c, b);
399                 six_unlock_write(&b->c.lock);
400
401                 six_unlock_intent(&b->c.lock);
402         }
403
404         mutex_unlock(&c->btree_reserve_cache_lock);
405 }
406
407 static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
408                                   unsigned flags, struct closure *cl)
409 {
410         struct bch_fs *c = as->c;
411         struct btree *b;
412         int ret;
413
414         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
415
416         /*
417          * Protects reaping from the btree node cache and using the btree node
418          * open bucket reserve:
419          */
420         ret = bch2_btree_cache_cannibalize_lock(c, cl);
421         if (ret)
422                 return ret;
423
424         while (as->nr_prealloc_nodes < nr_nodes) {
425                 b = __bch2_btree_node_alloc(c, &as->disk_res,
426                                             flags & BTREE_INSERT_NOWAIT
427                                             ? NULL : cl, flags);
428                 if (IS_ERR(b)) {
429                         ret = PTR_ERR(b);
430                         goto err_free;
431                 }
432
433                 ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
434                 if (ret)
435                         goto err_free;
436
437                 as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
438         }
439
440         bch2_btree_cache_cannibalize_unlock(c);
441         return 0;
442 err_free:
443         bch2_btree_cache_cannibalize_unlock(c);
444         trace_btree_reserve_get_fail(c, nr_nodes, cl);
445         return ret;
446 }
447
448 /* Asynchronous interior node update machinery */
449
450 static void bch2_btree_update_free(struct btree_update *as)
451 {
452         struct bch_fs *c = as->c;
453
454         bch2_journal_preres_put(&c->journal, &as->journal_preres);
455
456         bch2_journal_pin_drop(&c->journal, &as->journal);
457         bch2_journal_pin_flush(&c->journal, &as->journal);
458         bch2_disk_reservation_put(c, &as->disk_res);
459         bch2_btree_reserve_put(as);
460
461         mutex_lock(&c->btree_interior_update_lock);
462         list_del(&as->unwritten_list);
463         list_del(&as->list);
464         mutex_unlock(&c->btree_interior_update_lock);
465
466         closure_debug_destroy(&as->cl);
467         mempool_free(as, &c->btree_interior_update_pool);
468
469         closure_wake_up(&c->btree_interior_update_wait);
470 }
471
472 static void btree_update_will_delete_key(struct btree_update *as,
473                                          struct bkey_i *k)
474 {
475         BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
476                ARRAY_SIZE(as->_old_keys));
477         bch2_keylist_add(&as->old_keys, k);
478 }
479
480 static void btree_update_will_add_key(struct btree_update *as,
481                                       struct bkey_i *k)
482 {
483         BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
484                ARRAY_SIZE(as->_new_keys));
485         bch2_keylist_add(&as->new_keys, k);
486 }
487
488 /*
489  * The transactional part of an interior btree node update, where we journal the
490  * update we did to the interior node and update alloc info:
491  */
492 static int btree_update_nodes_written_trans(struct btree_trans *trans,
493                                             struct btree_update *as)
494 {
495         struct bkey_i *k;
496         int ret;
497
498         trans->extra_journal_entries = (void *) &as->journal_entries[0];
499         trans->extra_journal_entry_u64s = as->journal_u64s;
500         trans->journal_pin = &as->journal;
501
502         for_each_keylist_key(&as->new_keys, k) {
503                 ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
504                                           0, 0, BTREE_TRIGGER_INSERT);
505                 if (ret)
506                         return ret;
507         }
508
509         for_each_keylist_key(&as->old_keys, k) {
510                 ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
511                                           0, 0, BTREE_TRIGGER_OVERWRITE);
512                 if (ret)
513                         return ret;
514         }
515
516         return 0;
517 }
518
519 static void btree_update_nodes_written(struct btree_update *as)
520 {
521         struct bch_fs *c = as->c;
522         struct btree *b = as->b;
523         u64 journal_seq = 0;
524         unsigned i;
525         int ret;
526
527         /*
528          * We did an update to a parent node where the pointers we added pointed
529          * to child nodes that weren't written yet: now, the child nodes have
530          * been written so we can write out the update to the interior node.
531          */
532
533         /*
534          * We can't call into journal reclaim here: we'd block on the journal
535          * reclaim lock, but we may need to release the open buckets we have
536          * pinned in order for other btree updates to make forward progress, and
537          * journal reclaim does btree updates when flushing bkey_cached entries,
538          * which may require allocations as well.
539          */
540         ret = bch2_trans_do(c, &as->disk_res, &journal_seq,
541                             BTREE_INSERT_NOFAIL|
542                             BTREE_INSERT_USE_RESERVE|
543                             BTREE_INSERT_USE_ALLOC_RESERVE|
544                             BTREE_INSERT_NOCHECK_RW|
545                             BTREE_INSERT_JOURNAL_RECLAIM|
546                             BTREE_INSERT_JOURNAL_RESERVED,
547                             btree_update_nodes_written_trans(&trans, as));
548         BUG_ON(ret && !bch2_journal_error(&c->journal));
549
550         if (b) {
551                 /*
552                  * @b is the node we did the final insert into:
553                  *
554                  * On failure to get a journal reservation, we still have to
555                  * unblock the write and allow most of the write path to happen
556                  * so that shutdown works, but the i->journal_seq mechanism
557                  * won't work to prevent the btree write from being visible (we
558                  * didn't get a journal sequence number) - instead
559                  * __bch2_btree_node_write() doesn't do the actual write if
560                  * we're in journal error state:
561                  */
562
563                 btree_node_lock_type(c, b, SIX_LOCK_intent);
564                 btree_node_lock_type(c, b, SIX_LOCK_write);
565                 mutex_lock(&c->btree_interior_update_lock);
566
567                 list_del(&as->write_blocked_list);
568
569                 if (!ret && as->b == b) {
570                         struct bset *i = btree_bset_last(b);
571
572                         BUG_ON(!b->c.level);
573                         BUG_ON(!btree_node_dirty(b));
574
575                         i->journal_seq = cpu_to_le64(
576                                 max(journal_seq,
577                                     le64_to_cpu(i->journal_seq)));
578
579                         bch2_btree_add_journal_pin(c, b, journal_seq);
580                 }
581
582                 mutex_unlock(&c->btree_interior_update_lock);
583                 six_unlock_write(&b->c.lock);
584
585                 btree_node_write_if_need(c, b, SIX_LOCK_intent);
586                 six_unlock_intent(&b->c.lock);
587         }
588
589         bch2_journal_pin_drop(&c->journal, &as->journal);
590
591         bch2_journal_preres_put(&c->journal, &as->journal_preres);
592
593         mutex_lock(&c->btree_interior_update_lock);
594         for (i = 0; i < as->nr_new_nodes; i++) {
595                 b = as->new_nodes[i];
596
597                 BUG_ON(b->will_make_reachable != (unsigned long) as);
598                 b->will_make_reachable = 0;
599         }
600         mutex_unlock(&c->btree_interior_update_lock);
601
602         for (i = 0; i < as->nr_new_nodes; i++) {
603                 b = as->new_nodes[i];
604
605                 btree_node_lock_type(c, b, SIX_LOCK_read);
606                 btree_node_write_if_need(c, b, SIX_LOCK_read);
607                 six_unlock_read(&b->c.lock);
608         }
609
610         for (i = 0; i < as->nr_open_buckets; i++)
611                 bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
612
613         bch2_btree_update_free(as);
614 }
615
616 static void btree_interior_update_work(struct work_struct *work)
617 {
618         struct bch_fs *c =
619                 container_of(work, struct bch_fs, btree_interior_update_work);
620         struct btree_update *as;
621
622         while (1) {
623                 mutex_lock(&c->btree_interior_update_lock);
624                 as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
625                                               struct btree_update, unwritten_list);
626                 if (as && !as->nodes_written)
627                         as = NULL;
628                 mutex_unlock(&c->btree_interior_update_lock);
629
630                 if (!as)
631                         break;
632
633                 btree_update_nodes_written(as);
634         }
635 }
636
637 static void btree_update_set_nodes_written(struct closure *cl)
638 {
639         struct btree_update *as = container_of(cl, struct btree_update, cl);
640         struct bch_fs *c = as->c;
641
642         mutex_lock(&c->btree_interior_update_lock);
643         as->nodes_written = true;
644         mutex_unlock(&c->btree_interior_update_lock);
645
646         queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
647 }
648
649 /*
650  * We're updating @b with pointers to nodes that haven't finished writing yet:
651  * block @b from being written until @as completes
652  */
653 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
654 {
655         struct bch_fs *c = as->c;
656
657         mutex_lock(&c->btree_interior_update_lock);
658         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
659
660         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
661         BUG_ON(!btree_node_dirty(b));
662
663         as->mode        = BTREE_INTERIOR_UPDATING_NODE;
664         as->b           = b;
665         list_add(&as->write_blocked_list, &b->write_blocked);
666
667         mutex_unlock(&c->btree_interior_update_lock);
668 }
669
670 static void btree_update_reparent(struct btree_update *as,
671                                   struct btree_update *child)
672 {
673         struct bch_fs *c = as->c;
674
675         lockdep_assert_held(&c->btree_interior_update_lock);
676
677         child->b = NULL;
678         child->mode = BTREE_INTERIOR_UPDATING_AS;
679
680         /*
681          * When we write a new btree root, we have to drop our journal pin
682          * _before_ the new nodes are technically reachable; see
683          * btree_update_nodes_written().
684          *
685          * This goes for journal pins that are recursively blocked on us - so,
686          * just transfer the journal pin to the new interior update so
687          * btree_update_nodes_written() can drop it.
688          */
689         bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
690         bch2_journal_pin_drop(&c->journal, &child->journal);
691 }
692
693 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
694 {
695         struct bkey_i *insert = &b->key;
696         struct bch_fs *c = as->c;
697
698         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
699
700         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
701                ARRAY_SIZE(as->journal_entries));
702
703         as->journal_u64s +=
704                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
705                                   BCH_JSET_ENTRY_btree_root,
706                                   b->c.btree_id, b->c.level,
707                                   insert, insert->k.u64s);
708
709         mutex_lock(&c->btree_interior_update_lock);
710         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
711
712         as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
713         mutex_unlock(&c->btree_interior_update_lock);
714 }
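/*
 * Editorial note on btree_update_updated_root() above: unlike
 * btree_update_updated_node(), a root update doesn't block any existing
 * node's write - the new root key is queued in as->journal_entries as a
 * BCH_JSET_ENTRY_btree_root entry, and gets journalled with the transaction
 * committed from btree_update_nodes_written().
 */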
715
716 /*
717  * bch2_btree_update_add_new_node:
718  *
719  * This causes @as to wait on @b to be written, before it gets to
720  * btree_update_nodes_written()
721  *
722  * Additionally, it sets b->will_make_reachable to prevent any additional writes
723  * to @b from happening besides the first until @b is reachable on disk
724  *
725  * And it adds @b to the list of @as's new nodes, so that we can update sector
726  * counts in btree_update_nodes_written():
727  */
728 void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
729 {
730         struct bch_fs *c = as->c;
731
732         closure_get(&as->cl);
733
734         mutex_lock(&c->btree_interior_update_lock);
735         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
736         BUG_ON(b->will_make_reachable);
737
738         as->new_nodes[as->nr_new_nodes++] = b;
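        /*
         * Editorial note: will_make_reachable is a tagged pointer to @as; the
         * low bit set here records that we hold a ref on as->cl (from the
         * closure_get() above), which btree_update_drop_new_node() tests
         * (v & 1) before calling closure_put().
         */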
739         b->will_make_reachable = 1UL|(unsigned long) as;
740
741         mutex_unlock(&c->btree_interior_update_lock);
742
743         btree_update_will_add_key(as, &b->key);
744 }
745
746 /*
747  * If @b was a new node, drop it from the btree_update that was going to make it reachable:
748  */
749 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
750 {
751         struct btree_update *as;
752         unsigned long v;
753         unsigned i;
754
755         mutex_lock(&c->btree_interior_update_lock);
756         /*
757          * When b->will_make_reachable != 0, it owns a ref on as->cl that's
758          * dropped when it gets written by bch2_btree_complete_write - the
759          * xchg() is for synchronization with bch2_btree_complete_write:
760          */
761         v = xchg(&b->will_make_reachable, 0);
762         as = (struct btree_update *) (v & ~1UL);
763
764         if (!as) {
765                 mutex_unlock(&c->btree_interior_update_lock);
766                 return;
767         }
768
769         for (i = 0; i < as->nr_new_nodes; i++)
770                 if (as->new_nodes[i] == b)
771                         goto found;
772
773         BUG();
774 found:
775         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
776         mutex_unlock(&c->btree_interior_update_lock);
777
778         if (v & 1)
779                 closure_put(&as->cl);
780 }
781
782 void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
783 {
784         while (b->ob.nr)
785                 as->open_buckets[as->nr_open_buckets++] =
786                         b->ob.v[--b->ob.nr];
787 }
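/*
 * Editorial note: the loop above moves the node's open bucket references onto
 * @as, so btree_update_nodes_written() puts them only once the update has
 * completed.
 */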
788
789 /*
790  * @b is being split/rewritten: it may have pointers to not-yet-written btree
791  * nodes and thus outstanding btree_updates - redirect @b's
792  * btree_updates to point to this btree_update:
793  */
794 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
795                                                struct btree *b)
796 {
797         struct bch_fs *c = as->c;
798         struct btree_update *p, *n;
799         struct btree_write *w;
800
801         set_btree_node_dying(b);
802
803         if (btree_node_fake(b))
804                 return;
805
806         mutex_lock(&c->btree_interior_update_lock);
807
808         /*
809          * Does this node have any btree_update operations preventing
810          * it from being written?
811          *
812          * If so, redirect them to point to this btree_update: we can
813          * write out our new nodes, but we won't make them visible until those
814          * operations complete
815          */
816         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
817                 list_del_init(&p->write_blocked_list);
818                 btree_update_reparent(as, p);
819
820                 /*
821                  * for flush_held_btree_writes() waiting on updates to flush or
822                  * nodes to be writeable:
823                  */
824                 closure_wake_up(&c->btree_interior_update_wait);
825         }
826
827         clear_btree_node_dirty(b);
828         clear_btree_node_need_write(b);
829
830         /*
831          * Does this node have unwritten data that has a pin on the journal?
832          *
833          * If so, transfer that pin to the btree_update operation -
834          * note that if we're freeing multiple nodes, we only need to keep the
835          * oldest pin of any of the nodes we're freeing. We'll release the pin
836          * when the new nodes are persistent and reachable on disk:
837          */
838         w = btree_current_write(b);
839         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
840         bch2_journal_pin_drop(&c->journal, &w->journal);
841
842         w = btree_prev_write(b);
843         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
844         bch2_journal_pin_drop(&c->journal, &w->journal);
845
846         mutex_unlock(&c->btree_interior_update_lock);
847
848         /*
849          * Is this a node that isn't reachable on disk yet?
850          *
851          * Nodes that aren't reachable yet have writes blocked until they're
852          * reachable - now that we've cancelled any pending writes and moved
853          * things waiting on that write to wait on this update, we can drop this
854          * node from the list of nodes that the other update is making
855          * reachable, prior to freeing it:
856          */
857         btree_update_drop_new_node(c, b);
858
859         btree_update_will_delete_key(as, &b->key);
860 }
861
862 void bch2_btree_update_done(struct btree_update *as)
863 {
864         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
865
866         bch2_btree_reserve_put(as);
867
868         continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
869 }
870
871 struct btree_update *
872 bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
873                         unsigned nr_nodes, unsigned flags,
874                         struct closure *cl)
875 {
876         struct bch_fs *c = trans->c;
877         struct btree_update *as;
878         int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
879                 ? BCH_DISK_RESERVATION_NOFAIL : 0;
880         int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
881                 ? JOURNAL_RES_GET_RECLAIM : 0;
882         int ret = 0;
883
884         /*
885          * This check isn't necessary for correctness - it's just to potentially
886          * prevent us from doing a lot of work that'll end up being wasted:
887          */
888         ret = bch2_journal_error(&c->journal);
889         if (ret)
890                 return ERR_PTR(ret);
891
892         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
893         memset(as, 0, sizeof(*as));
894         closure_init(&as->cl, NULL);
895         as->c           = c;
896         as->mode        = BTREE_INTERIOR_NO_UPDATE;
897         as->btree_id    = id;
898         INIT_LIST_HEAD(&as->list);
899         INIT_LIST_HEAD(&as->unwritten_list);
900         INIT_LIST_HEAD(&as->write_blocked_list);
901         bch2_keylist_init(&as->old_keys, as->_old_keys);
902         bch2_keylist_init(&as->new_keys, as->_new_keys);
903         bch2_keylist_init(&as->parent_keys, as->inline_keys);
904
905         ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
906                                       BTREE_UPDATE_JOURNAL_RES,
907                                       journal_flags|JOURNAL_RES_GET_NONBLOCK);
908         if (ret == -EAGAIN) {
909                 if (flags & BTREE_INSERT_NOUNLOCK)
910                         return ERR_PTR(-EINTR);
911
912                 bch2_trans_unlock(trans);
913
914                 ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
915                                 BTREE_UPDATE_JOURNAL_RES,
916                                 journal_flags);
917                 if (ret)
918                         return ERR_PTR(ret);
919
920                 if (!bch2_trans_relock(trans)) {
921                         ret = -EINTR;
922                         goto err;
923                 }
924         }
925
926         ret = bch2_disk_reservation_get(c, &as->disk_res,
927                         nr_nodes * c->opts.btree_node_size,
928                         c->opts.metadata_replicas,
929                         disk_res_flags);
930         if (ret)
931                 goto err;
932
933         ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
934         if (ret)
935                 goto err;
936
937         mutex_lock(&c->btree_interior_update_lock);
938         list_add_tail(&as->list, &c->btree_interior_update_list);
939         mutex_unlock(&c->btree_interior_update_lock);
940
941         return as;
942 err:
943         bch2_btree_update_free(as);
944         return ERR_PTR(ret);
945 }
946
947 /* Btree root updates: */
948
949 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
950 {
951         /* Root nodes cannot be reaped */
952         mutex_lock(&c->btree_cache.lock);
953         list_del_init(&b->list);
954         mutex_unlock(&c->btree_cache.lock);
955
956         mutex_lock(&c->btree_root_lock);
957         BUG_ON(btree_node_root(c, b) &&
958                (b->c.level < btree_node_root(c, b)->c.level ||
959                 !btree_node_dying(btree_node_root(c, b))));
960
961         btree_node_root(c, b) = b;
962         mutex_unlock(&c->btree_root_lock);
963
964         bch2_recalc_btree_reserve(c);
965 }
966
967 /**
968  * bch2_btree_set_root - update the root in memory and on disk
969  *
970  * To ensure forward progress, the current task must not be holding any
971  * btree node write locks. However, you must hold an intent lock on the
972  * old root.
973  *
974  * Note: This allocates a journal entry but doesn't add any keys to
975  * it.  All the btree roots are part of every journal write, so there
976  * is nothing new to be done.  This just guarantees that there is a
977  * journal write.
978  */
979 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
980                                 struct btree_iter *iter)
981 {
982         struct bch_fs *c = as->c;
983         struct btree *old;
984
985         trace_btree_set_root(c, b);
986         BUG_ON(!b->written &&
987                !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
988
989         old = btree_node_root(c, b);
990
991         /*
992          * Ensure no one is using the old root while we switch to the
993          * new root:
994          */
995         bch2_btree_node_lock_write(old, iter);
996
997         bch2_btree_set_root_inmem(c, b);
998
999         btree_update_updated_root(as, b);
1000
1001         /*
1002          * Unlock old root after new root is visible:
1003          *
1004          * The new root isn't persistent, but that's ok: we still have
1005          * an intent lock on the new root, and any updates that would
1006          * depend on the new root would have to update the new root.
1007          */
1008         bch2_btree_node_unlock_write(old, iter);
1009 }
1010
1011 /* Interior node updates: */
1012
1013 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1014                                         struct btree_iter *iter,
1015                                         struct bkey_i *insert,
1016                                         struct btree_node_iter *node_iter)
1017 {
1018         struct bkey_packed *k;
1019
1020         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
1021                ARRAY_SIZE(as->journal_entries));
1022
1023         as->journal_u64s +=
1024                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
1025                                   BCH_JSET_ENTRY_btree_keys,
1026                                   b->c.btree_id, b->c.level,
1027                                   insert, insert->k.u64s);
1028
1029         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1030                bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1031                 bch2_btree_node_iter_advance(node_iter, b);
1032
1033         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1034         set_btree_node_dirty(b);
1035         set_btree_node_need_write(b);
1036 }
1037
1038 /*
1039  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1040  * node)
1041  */
1042 static struct btree *__btree_split_node(struct btree_update *as,
1043                                         struct btree *n1,
1044                                         struct btree_iter *iter)
1045 {
1046         size_t nr_packed = 0, nr_unpacked = 0;
1047         struct btree *n2;
1048         struct bset *set1, *set2;
1049         struct bkey_packed *k, *prev = NULL;
1050
1051         n2 = bch2_btree_node_alloc(as, n1->c.level);
1052         bch2_btree_update_add_new_node(as, n2);
1053
1054         n2->data->max_key       = n1->data->max_key;
1055         n2->data->format        = n1->format;
1056         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1057         n2->key.k.p = n1->key.k.p;
1058
1059         btree_node_set_format(n2, n2->data->format);
1060
1061         set1 = btree_bset_first(n1);
1062         set2 = btree_bset_first(n2);
1063
1064         /*
1065          * Has to be a linear search because we don't have an auxiliary
1066          * search tree yet
1067          */
1068         k = set1->start;
1069         while (1) {
1070                 struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
1071
1072                 if (n == vstruct_last(set1))
1073                         break;
1074                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1075                         break;
1076
1077                 if (bkey_packed(k))
1078                         nr_packed++;
1079                 else
1080                         nr_unpacked++;
1081
1082                 prev = k;
1083                 k = n;
1084         }
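        /*
         * Editorial note: the loop above stops once roughly 3/5 of set1's
         * u64s have been walked, so n1 keeps about 60% of the keys and n2
         * gets the rest; @prev is the last key staying in n1 and becomes its
         * max_key below.
         */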
1085
1086         BUG_ON(!prev);
1087
1088         btree_set_max(n1, bkey_unpack_pos(n1, prev));
1089         btree_set_min(n2, bkey_successor(n1->key.k.p));
1090
1091         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1092         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1093
1094         set_btree_bset_end(n1, n1->set);
1095         set_btree_bset_end(n2, n2->set);
1096
1097         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1098         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1099         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1100         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1101
1102         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1103         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1104         n1->nr.packed_keys      = nr_packed;
1105         n1->nr.unpacked_keys    = nr_unpacked;
1106
1107         BUG_ON(!set1->u64s);
1108         BUG_ON(!set2->u64s);
1109
1110         memcpy_u64s(set2->start,
1111                     vstruct_end(set1),
1112                     le16_to_cpu(set2->u64s));
1113
1114         btree_node_reset_sib_u64s(n1);
1115         btree_node_reset_sib_u64s(n2);
1116
1117         bch2_verify_btree_nr_keys(n1);
1118         bch2_verify_btree_nr_keys(n2);
1119
1120         if (n1->c.level) {
1121                 btree_node_interior_verify(n1);
1122                 btree_node_interior_verify(n2);
1123         }
1124
1125         return n2;
1126 }
1127
1128 /*
1129  * For updates to interior nodes, we've got to do the insert before we split
1130  * because the stuff we're inserting has to be inserted atomically. Post split,
1131  * the keys might have to go in different nodes and the split would no longer be
1132  * atomic.
1133  *
1134  * Worse, if the insert is from btree node coalescing and we do the insert after
1135  * we do the split (and pick the pivot), the pivot we pick might fall between
1136  * nodes that were coalesced, and thus in the middle of a child node post
1137  * coalescing:
1138  */
1139 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1140                                     struct btree_iter *iter,
1141                                     struct keylist *keys)
1142 {
1143         struct btree_node_iter node_iter;
1144         struct bkey_i *k = bch2_keylist_front(keys);
1145         struct bkey_packed *src, *dst, *n;
1146         struct bset *i;
1147
1148         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1149
1150         bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1151
1152         while (!bch2_keylist_empty(keys)) {
1153                 k = bch2_keylist_front(keys);
1154
1155                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1156                 bch2_keylist_pop_front(keys);
1157         }
1158
1159         /*
1160          * We can't tolerate whiteouts here - with whiteouts there can be
1161          * duplicate keys, and it would be rather bad if we picked a duplicate
1162          * for the pivot:
1163          */
1164         i = btree_bset_first(b);
1165         src = dst = i->start;
1166         while (src != vstruct_last(i)) {
1167                 n = bkey_next_skip_noops(src, vstruct_last(i));
1168                 if (!bkey_deleted(src)) {
1169                         memmove_u64s_down(dst, src, src->u64s);
1170                         dst = bkey_next(dst);
1171                 }
1172                 src = n;
1173         }
1174
1175         i->u64s = cpu_to_le16((u64 *) dst - i->_data);
1176         set_btree_bset_end(b, b->set);
1177
1178         BUG_ON(b->nsets != 1 ||
1179                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1180
1181         btree_node_interior_verify(b);
1182 }
1183
1184 static void btree_split(struct btree_update *as, struct btree *b,
1185                         struct btree_iter *iter, struct keylist *keys,
1186                         unsigned flags)
1187 {
1188         struct bch_fs *c = as->c;
1189         struct btree *parent = btree_node_parent(iter, b);
1190         struct btree *n1, *n2 = NULL, *n3 = NULL;
1191         u64 start_time = local_clock();
1192
1193         BUG_ON(!parent && (b != btree_node_root(c, b)));
1194         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
1195
1196         bch2_btree_interior_update_will_free_node(as, b);
1197
1198         n1 = bch2_btree_node_alloc_replacement(as, b);
1199         bch2_btree_update_add_new_node(as, n1);
1200
1201         if (keys)
1202                 btree_split_insert_keys(as, n1, iter, keys);
1203
1204         if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
1205                 trace_btree_split(c, b);
1206
1207                 n2 = __btree_split_node(as, n1, iter);
1208
1209                 bch2_btree_build_aux_trees(n2);
1210                 bch2_btree_build_aux_trees(n1);
1211                 six_unlock_write(&n2->c.lock);
1212                 six_unlock_write(&n1->c.lock);
1213
1214                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1215
1216                 /*
1217                  * Note that on recursive splits parent_keys == keys, so we
1218                  * can't start adding new keys to parent_keys before emptying it
1219                  * out (which we did with btree_split_insert_keys() above)
1220                  */
1221                 bch2_keylist_add(&as->parent_keys, &n1->key);
1222                 bch2_keylist_add(&as->parent_keys, &n2->key);
1223
1224                 if (!parent) {
1225                         /* Depth increases, make a new root */
1226                         n3 = __btree_root_alloc(as, b->c.level + 1);
1227
1228                         n3->sib_u64s[0] = U16_MAX;
1229                         n3->sib_u64s[1] = U16_MAX;
1230
1231                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1232
1233                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1234                 }
1235         } else {
1236                 trace_btree_compact(c, b);
1237
1238                 bch2_btree_build_aux_trees(n1);
1239                 six_unlock_write(&n1->c.lock);
1240
1241                 if (parent)
1242                         bch2_keylist_add(&as->parent_keys, &n1->key);
1243         }
1244
1245         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1246
1247         /* New nodes all written, now make them visible: */
1248
1249         if (parent) {
1250                 /* Split a non root node */
1251                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1252         } else if (n3) {
1253                 bch2_btree_set_root(as, n3, iter);
1254         } else {
1255                 /* Root filled up but didn't need to be split */
1256                 bch2_btree_set_root(as, n1, iter);
1257         }
1258
1259         bch2_btree_update_get_open_buckets(as, n1);
1260         if (n2)
1261                 bch2_btree_update_get_open_buckets(as, n2);
1262         if (n3)
1263                 bch2_btree_update_get_open_buckets(as, n3);
1264
1265         /* Successful split, update the iterator to point to the new nodes: */
1266
1267         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1268         bch2_btree_iter_node_drop(iter, b);
1269         if (n3)
1270                 bch2_btree_iter_node_replace(iter, n3);
1271         if (n2)
1272                 bch2_btree_iter_node_replace(iter, n2);
1273         bch2_btree_iter_node_replace(iter, n1);
1274
1275         /*
1276          * The old node must be freed (in memory) _before_ unlocking the new
1277          * nodes - else another thread could re-acquire a read lock on the old
1278          * node after another thread has locked and updated the new node, thus
1279          * seeing stale data:
1280          */
1281         bch2_btree_node_free_inmem(c, b, iter);
1282
1283         if (n3)
1284                 six_unlock_intent(&n3->c.lock);
1285         if (n2)
1286                 six_unlock_intent(&n2->c.lock);
1287         six_unlock_intent(&n1->c.lock);
1288
1289         bch2_btree_trans_verify_locks(iter->trans);
1290
1291         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
1292                                start_time);
1293 }
1294
1295 static void
1296 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1297                                 struct btree_iter *iter, struct keylist *keys)
1298 {
1299         struct btree_iter *linked;
1300         struct btree_node_iter node_iter;
1301         struct bkey_i *insert = bch2_keylist_front(keys);
1302         struct bkey_packed *k;
1303
1304         /* Don't screw up @iter's position: */
1305         node_iter = iter->l[b->c.level].iter;
1306
1307         /*
1308          * btree_split(), btree_gc_coalesce() will insert keys before
1309          * the iterator's current position - they know the keys go in
1310          * the node the iterator points to:
1311          */
1312         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1313                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1314                 ;
1315
1316         for_each_keylist_key(keys, insert)
1317                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1318
1319         btree_update_updated_node(as, b);
1320
1321         trans_for_each_iter_with_node(iter->trans, b, linked)
1322                 bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
1323
1324         bch2_btree_trans_verify_iters(iter->trans, b);
1325 }
1326
1327 /**
1328  * bch2_btree_insert_node - insert bkeys into a given btree node
1329  *
1330  * @as:                 btree_update this insert is part of
1331  * @b:                  interior node to insert into
1332  * @iter:               btree iterator
1333  * @keys:               list of keys to insert
1334  * @flags:              transaction flags
1335  *
1336  * Inserts as many keys as it can into a given btree node. Inserts here are
1337  * atomic: if the keys don't fit, @b is split and they go in as part of the split.
1338  */
1339 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1340                             struct btree_iter *iter, struct keylist *keys,
1341                             unsigned flags)
1342 {
1343         struct bch_fs *c = as->c;
1344         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1345         int old_live_u64s = b->nr.live_u64s;
1346         int live_u64s_added, u64s_added;
1347
1348         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
1349         BUG_ON(!b->c.level);
1350         BUG_ON(!as || as->b);
1351         bch2_verify_keylist_sorted(keys);
1352
1353         if (as->must_rewrite)
1354                 goto split;
1355
1356         bch2_btree_node_lock_for_insert(c, b, iter);
1357
1358         if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
1359                 bch2_btree_node_unlock_write(b, iter);
1360                 goto split;
1361         }
1362
1363         bch2_btree_insert_keys_interior(as, b, iter, keys);
1364
1365         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1366         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1367
1368         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1369                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1370         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1371                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1372
1373         if (u64s_added > live_u64s_added &&
1374             bch2_maybe_compact_whiteouts(c, b))
1375                 bch2_btree_iter_reinit_node(iter, b);
1376
1377         bch2_btree_node_unlock_write(b, iter);
1378
1379         btree_node_interior_verify(b);
1380
1381         /*
1382          * when called from the btree_split path the new nodes aren't added to
1383          * the btree iterator yet, so the merge path's unlock/wait/relock dance
1384          * won't work:
1385          */
1386         bch2_foreground_maybe_merge(c, iter, b->c.level,
1387                                     flags|BTREE_INSERT_NOUNLOCK);
1388         return;
1389 split:
1390         btree_split(as, b, iter, keys, flags);
1391 }
1392
1393 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1394                           unsigned flags)
1395 {
1396         struct btree_trans *trans = iter->trans;
1397         struct btree *b = iter_l(iter)->b;
1398         struct btree_update *as;
1399         struct closure cl;
1400         int ret = 0;
1401         struct btree_iter *linked;
1402
1403         /*
1404          * We already have a disk reservation and open buckets pinned; this
1405          * allocation must not block:
1406          */
1407         trans_for_each_iter(trans, linked)
1408                 if (linked->btree_id == BTREE_ID_EXTENTS)
1409                         flags |= BTREE_INSERT_USE_RESERVE;
1410
1411         closure_init_stack(&cl);
1412
1413         /* Hack, because gc and splitting nodes don't mix yet: */
1414         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1415             !down_read_trylock(&c->gc_lock)) {
1416                 if (flags & BTREE_INSERT_NOUNLOCK) {
1417                         trace_transaction_restart_ip(trans->ip, _THIS_IP_);
1418                         return -EINTR;
1419                 }
1420
1421                 bch2_trans_unlock(trans);
1422                 down_read(&c->gc_lock);
1423
1424                 if (!bch2_trans_relock(trans))
1425                         ret = -EINTR;
1426         }
1427
1428         /*
1429          * XXX: figure out how far we might need to split,
1430          * instead of locking/reserving all the way to the root:
1431          */
1432         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1433                 trace_trans_restart_iter_upgrade(trans->ip);
1434                 ret = -EINTR;
1435                 goto out;
1436         }
1437
1438         as = bch2_btree_update_start(trans, iter->btree_id,
1439                 btree_update_reserve_required(c, b), flags,
1440                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1441         if (IS_ERR(as)) {
1442                 ret = PTR_ERR(as);
1443                 if (ret == -EAGAIN) {
1444                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1445                         bch2_trans_unlock(trans);
1446                         ret = -EINTR;
1447
1448                         trace_transaction_restart_ip(trans->ip, _THIS_IP_);
1449                 }
1450                 goto out;
1451         }
1452
1453         btree_split(as, b, iter, NULL, flags);
1454         bch2_btree_update_done(as);
1455
1456         /*
1457          * We haven't successfully inserted yet, so don't downgrade all the way
1458          * back to read locks;
1459          */
1460         __bch2_btree_iter_downgrade(iter, 1);
1461 out:
1462         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1463                 up_read(&c->gc_lock);
1464         closure_sync(&cl);
1465         return ret;
1466 }
1467
1468 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1469                                    struct btree_iter *iter,
1470                                    unsigned level,
1471                                    unsigned flags,
1472                                    enum btree_node_sibling sib)
1473 {
1474         struct btree_trans *trans = iter->trans;
1475         struct btree_update *as;
1476         struct bkey_format_state new_s;
1477         struct bkey_format new_f;
1478         struct bkey_i delete;
1479         struct btree *b, *m, *n, *prev, *next, *parent;
1480         struct closure cl;
1481         size_t sib_u64s;
1482         int ret = 0;
1483
1484         BUG_ON(!btree_node_locked(iter, level));
1485
1486         closure_init_stack(&cl);
1487 retry:
1488         BUG_ON(!btree_node_locked(iter, level));
1489
1490         b = iter->l[level].b;
1491
1492         parent = btree_node_parent(iter, b);
1493         if (!parent)
1494                 goto out;
1495
1496         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1497                 goto out;
1498
1499         /* XXX: can't be holding read locks */
1500         m = bch2_btree_node_get_sibling(c, iter, b, sib);
1501         if (IS_ERR(m)) {
1502                 ret = PTR_ERR(m);
1503                 goto err;
1504         }
1505
1506         /* NULL means no sibling: */
1507         if (!m) {
1508                 b->sib_u64s[sib] = U16_MAX;
1509                 goto out;
1510         }
1511
1512         if (sib == btree_prev_sib) {
1513                 prev = m;
1514                 next = b;
1515         } else {
1516                 prev = b;
1517                 next = m;
1518         }
1519
1520         bch2_bkey_format_init(&new_s);
1521         __bch2_btree_calc_format(&new_s, b);
1522         __bch2_btree_calc_format(&new_s, m);
1523         new_f = bch2_bkey_format_done(&new_s);
1524
1525         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1526                 btree_node_u64s_with_format(m, &new_f);
1527
1528         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1529                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1530                 sib_u64s /= 2;
1531                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1532         }
1533
1534         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1535         b->sib_u64s[sib] = sib_u64s;
1536
1537         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1538                 six_unlock_intent(&m->c.lock);
1539                 goto out;
1540         }
1541
1542         /* We're changing btree topology, which doesn't mix with gc: */
1543         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1544             !down_read_trylock(&c->gc_lock))
1545                 goto err_cycle_gc_lock;
1546
1547         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1548                 ret = -EINTR;
1549                 goto err_unlock;
1550         }
1551
1552         as = bch2_btree_update_start(trans, iter->btree_id,
1553                          btree_update_reserve_required(c, parent) + 1,
1554                          flags|
1555                          BTREE_INSERT_NOFAIL|
1556                          BTREE_INSERT_USE_RESERVE,
1557                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1558         if (IS_ERR(as)) {
1559                 ret = PTR_ERR(as);
1560                 goto err_unlock;
1561         }
1562
1563         trace_btree_merge(c, b);
1564
1565         bch2_btree_interior_update_will_free_node(as, b);
1566         bch2_btree_interior_update_will_free_node(as, m);
1567
1568         n = bch2_btree_node_alloc(as, b->c.level);
1569         bch2_btree_update_add_new_node(as, n);
1570
1571         btree_set_min(n, prev->data->min_key);
1572         btree_set_max(n, next->data->max_key);
1573         n->data->format         = new_f;
1574
1575         btree_node_set_format(n, new_f);
1576
1577         bch2_btree_sort_into(c, n, prev);
1578         bch2_btree_sort_into(c, n, next);
1579
1580         bch2_btree_build_aux_trees(n);
1581         six_unlock_write(&n->c.lock);
1582
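        /*
         * Make the merge visible in the parent: a deleted key at @prev's old
         * position drops @prev's entry, and @n's key (which covers both old
         * nodes' ranges and sits at @next's old position) replaces @next's
         * entry.
         */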
1583         bkey_init(&delete.k);
1584         delete.k.p = prev->key.k.p;
1585         bch2_keylist_add(&as->parent_keys, &delete);
1586         bch2_keylist_add(&as->parent_keys, &n->key);
1587
1588         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1589
1590         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1591
1592         bch2_btree_update_get_open_buckets(as, n);
1593
1594         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1595         bch2_btree_iter_node_drop(iter, b);
1596         bch2_btree_iter_node_drop(iter, m);
1597
1598         bch2_btree_iter_node_replace(iter, n);
1599
1600         bch2_btree_trans_verify_iters(trans, n);
1601
1602         bch2_btree_node_free_inmem(c, b, iter);
1603         bch2_btree_node_free_inmem(c, m, iter);
1604
1605         six_unlock_intent(&n->c.lock);
1606
1607         bch2_btree_update_done(as);
1608
1609         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1610                 up_read(&c->gc_lock);
1611 out:
1612         bch2_btree_trans_verify_locks(trans);
1613
1614         /*
1615          * Don't downgrade locks here: we're called after successful insert,
1616          * and the caller will downgrade locks after a successful insert
1617          * anyway (in case e.g. a split was required first)
1618          *
1619          * And we're also called when inserting into interior nodes in the
1620          * split path, and downgrading to read locks in there is potentially
1621          * confusing:
1622          */
1623         closure_sync(&cl);
1624         return;
1625
1626 err_cycle_gc_lock:
1627         six_unlock_intent(&m->c.lock);
1628
1629         if (flags & BTREE_INSERT_NOUNLOCK)
1630                 goto out;
1631
1632         bch2_trans_unlock(trans);
1633
1634         down_read(&c->gc_lock);
1635         up_read(&c->gc_lock);
1636         ret = -EINTR;
1637         goto err;
1638
1639 err_unlock:
1640         six_unlock_intent(&m->c.lock);
1641         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1642                 up_read(&c->gc_lock);
1643 err:
1644         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1645
1646         if ((ret == -EAGAIN || ret == -EINTR) &&
1647             !(flags & BTREE_INSERT_NOUNLOCK)) {
1648                 bch2_trans_unlock(trans);
1649                 closure_sync(&cl);
1650                 ret = bch2_btree_iter_traverse(iter);
1651                 if (ret)
1652                         goto out;
1653
1654                 goto retry;
1655         }
1656
1657         goto out;
1658 }
1659
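/*
 * Allocate a replacement for @b with identical contents, write it out, and
 * swap it into the tree: the new key is inserted into @b's parent, or the
 * new node becomes the root if @b has no parent.
 */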
1660 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1661                                 struct btree *b, unsigned flags,
1662                                 struct closure *cl)
1663 {
1664         struct btree *n, *parent = btree_node_parent(iter, b);
1665         struct btree_update *as;
1666
1667         as = bch2_btree_update_start(iter->trans, iter->btree_id,
1668                 (parent
1669                  ? btree_update_reserve_required(c, parent)
1670                  : 0) + 1,
1671                 flags, cl);
1672         if (IS_ERR(as)) {
1673                 trace_btree_gc_rewrite_node_fail(c, b);
1674                 return PTR_ERR(as);
1675         }
1676
1677         bch2_btree_interior_update_will_free_node(as, b);
1678
1679         n = bch2_btree_node_alloc_replacement(as, b);
1680         bch2_btree_update_add_new_node(as, n);
1681
1682         bch2_btree_build_aux_trees(n);
1683         six_unlock_write(&n->c.lock);
1684
1685         trace_btree_gc_rewrite_node(c, b);
1686
1687         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1688
1689         if (parent) {
1690                 bch2_keylist_add(&as->parent_keys, &n->key);
1691                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1692         } else {
1693                 bch2_btree_set_root(as, n, iter);
1694         }
1695
1696         bch2_btree_update_get_open_buckets(as, n);
1697
1698         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1699         bch2_btree_iter_node_drop(iter, b);
1700         bch2_btree_iter_node_replace(iter, n);
1701         bch2_btree_node_free_inmem(c, b, iter);
1702         six_unlock_intent(&n->c.lock);
1703
1704         bch2_btree_update_done(as);
1705         return 0;
1706 }
1707
1708 /**
1709  * bch2_btree_node_rewrite - Rewrite/move a btree node
1710  *
1711  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. getting the
1712  * btree node reserve via bch2_btree_update_start() had to wait)
1713  */
1714 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1715                             __le64 seq, unsigned flags)
1716 {
1717         struct btree_trans *trans = iter->trans;
1718         struct closure cl;
1719         struct btree *b;
1720         int ret;
1721
1722         flags |= BTREE_INSERT_NOFAIL;
1723
1724         closure_init_stack(&cl);
1725
1726         bch2_btree_iter_upgrade(iter, U8_MAX);
1727
1728         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1729                 if (!down_read_trylock(&c->gc_lock)) {
1730                         bch2_trans_unlock(trans);
1731                         down_read(&c->gc_lock);
1732                 }
1733         }
1734
1735         while (1) {
1736                 ret = bch2_btree_iter_traverse(iter);
1737                 if (ret)
1738                         break;
1739
1740                 b = bch2_btree_iter_peek_node(iter);
1741                 if (!b || b->data->keys.seq != seq)
1742                         break;
1743
1744                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1745                 if (ret != -EAGAIN &&
1746                     ret != -EINTR)
1747                         break;
1748
1749                 bch2_trans_unlock(trans);
1750                 closure_sync(&cl);
1751         }
1752
1753         bch2_btree_iter_downgrade(iter);
1754
1755         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1756                 up_read(&c->gc_lock);
1757
1758         closure_sync(&cl);
1759         return ret;
1760 }
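
/*
 * Rough usage sketch for bch2_btree_node_rewrite() (hypothetical caller, not
 * taken from this file): with @iter positioned at the node to rewrite,
 * passing the node's sequence number guards against rewriting a node that
 * has since been freed or replaced:
 *
 *	struct btree *b = bch2_btree_iter_peek_node(iter);
 *
 *	if (b)
 *		ret = bch2_btree_node_rewrite(c, iter, b->data->keys.seq, 0);
 */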
1761
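/*
 * Update the key (i.e. the pointer) for a btree node: replace @b's key in
 * its parent, or update the root entry if @b is a root, rehashing @b in the
 * btree node cache when the new key hashes differently. @new_hash, if
 * non-NULL, is a spare node that temporarily occupies the hash table slot
 * for the new key, apparently so lookups stay consistent while the parent
 * update is in flight.
 */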
1762 static void __bch2_btree_node_update_key(struct bch_fs *c,
1763                                          struct btree_update *as,
1764                                          struct btree_iter *iter,
1765                                          struct btree *b, struct btree *new_hash,
1766                                          struct bkey_i *new_key)
1767 {
1768         struct btree *parent;
1769         int ret;
1770
1771         btree_update_will_delete_key(as, &b->key);
1772         btree_update_will_add_key(as, new_key);
1773
1774         parent = btree_node_parent(iter, b);
1775         if (parent) {
1776                 if (new_hash) {
1777                         bkey_copy(&new_hash->key, new_key);
1778                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
1779                                         new_hash, b->c.level, b->c.btree_id);
1780                         BUG_ON(ret);
1781                 }
1782
1783                 bch2_keylist_add(&as->parent_keys, new_key);
1784                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
1785
1786                 if (new_hash) {
1787                         mutex_lock(&c->btree_cache.lock);
1788                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
1789
1790                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1791
1792                         bkey_copy(&b->key, new_key);
1793                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1794                         BUG_ON(ret);
1795                         mutex_unlock(&c->btree_cache.lock);
1796                 } else {
1797                         bkey_copy(&b->key, new_key);
1798                 }
1799         } else {
1800                 BUG_ON(btree_node_root(c, b) != b);
1801
1802                 bch2_btree_node_lock_write(b, iter);
1803                 bkey_copy(&b->key, new_key);
1804
1805                 if (btree_ptr_hash_val(&b->key) != b->hash_val) {
1806                         mutex_lock(&c->btree_cache.lock);
1807                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1808
1809                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1810                         BUG_ON(ret);
1811                         mutex_unlock(&c->btree_cache.lock);
1812                 }
1813
1814                 btree_update_updated_root(as, b);
1815                 bch2_btree_node_unlock_write(b, iter);
1816         }
1817
1818         bch2_btree_update_done(as);
1819 }
1820
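/*
 * Public entry point: takes gc_lock, allocates a spare cache node if the new
 * key would hash differently, starts an interior update and marks replicas
 * for the new key, then does the actual swap via
 * __bch2_btree_node_update_key().
 */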
1821 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
1822                                struct btree *b,
1823                                struct bkey_i *new_key)
1824 {
1825         struct btree *parent = btree_node_parent(iter, b);
1826         struct btree_update *as = NULL;
1827         struct btree *new_hash = NULL;
1828         struct closure cl;
1829         int ret;
1830
1831         closure_init_stack(&cl);
1832
1833         if (!bch2_btree_iter_upgrade(iter, U8_MAX))
1834                 return -EINTR;
1835
1836         if (!down_read_trylock(&c->gc_lock)) {
1837                 bch2_trans_unlock(iter->trans);
1838                 down_read(&c->gc_lock);
1839
1840                 if (!bch2_trans_relock(iter->trans)) {
1841                         ret = -EINTR;
1842                         goto err;
1843                 }
1844         }
1845
1846         /*
1847          * check btree_ptr_hash_val() after @b is locked by
1848          * btree_iter_traverse():
1849          */
1850         if (btree_ptr_hash_val(new_key) != b->hash_val) {
1851                 /* bch2_btree_reserve_get will unlock */
1852                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1853                 if (ret) {
1854                         bch2_trans_unlock(iter->trans);
1855                         up_read(&c->gc_lock);
1856                         closure_sync(&cl);
1857                         down_read(&c->gc_lock);
1858
1859                         if (!bch2_trans_relock(iter->trans)) {
1860                                 ret = -EINTR;
1861                                 goto err;
1862                         }
1863                 }
1864
1865                 new_hash = bch2_btree_node_mem_alloc(c);
1866         }
1867
1868         as = bch2_btree_update_start(iter->trans, iter->btree_id,
1869                 parent ? btree_update_reserve_required(c, parent) : 0,
1870                 BTREE_INSERT_NOFAIL|
1871                 BTREE_INSERT_USE_RESERVE|
1872                 BTREE_INSERT_USE_ALLOC_RESERVE,
1873                 &cl);
1874
1875         if (IS_ERR(as)) {
1876                 ret = PTR_ERR(as);
1877                 if (ret == -EAGAIN)
1878                         ret = -EINTR;
1879
1880                 if (ret != -EINTR)
1881                         goto err;
1882
1883                 bch2_trans_unlock(iter->trans);
1884                 up_read(&c->gc_lock);
1885                 closure_sync(&cl);
1886                 down_read(&c->gc_lock);
1887
1888                 if (!bch2_trans_relock(iter->trans))
1889                         goto err;
1890         }
1891
1892         ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
1893         if (ret)
1894                 goto err_free_update;
1895
1896         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
1897
1898         bch2_btree_iter_downgrade(iter);
1899 err:
1900         if (new_hash) {
1901                 mutex_lock(&c->btree_cache.lock);
1902                 list_move(&new_hash->list, &c->btree_cache.freeable);
1903                 mutex_unlock(&c->btree_cache.lock);
1904
1905                 six_unlock_write(&new_hash->c.lock);
1906                 six_unlock_intent(&new_hash->c.lock);
1907         }
1908         up_read(&c->gc_lock);
1909         closure_sync(&cl);
1910         return ret;
1911 err_free_update:
1912         bch2_btree_update_free(as);
1913         goto err;
1914 }
1915
1916 /* Init code: */
1917
1918 /*
1919  * Only for filesystem bringup: when first reading the btree roots, or when
1920  * allocating btree roots for a new filesystem:
1921  */
1922 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
1923 {
1924         BUG_ON(btree_node_root(c, b));
1925
1926         bch2_btree_set_root_inmem(c, b);
1927 }
1928
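/*
 * Allocate an empty root for btree @id, e.g. when initializing a new
 * filesystem: the node is marked fake and given an empty bset, so the btree
 * starts out with a valid, empty in-memory root to insert into.
 */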
1929 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
1930 {
1931         struct closure cl;
1932         struct btree *b;
1933         int ret;
1934
1935         closure_init_stack(&cl);
1936
1937         do {
1938                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1939                 closure_sync(&cl);
1940         } while (ret);
1941
1942         b = bch2_btree_node_mem_alloc(c);
1943         bch2_btree_cache_cannibalize_unlock(c);
1944
1945         set_btree_node_fake(b);
1946         b->c.level      = 0;
1947         b->c.btree_id   = id;
1948
1949         bkey_btree_ptr_init(&b->key);
1950         b->key.k.p = POS_MAX;
1951         *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
1952
1953         bch2_bset_init_first(b, &b->data->keys);
1954         bch2_btree_build_aux_trees(b);
1955
1956         b->data->flags = 0;
1957         btree_set_min(b, POS_MIN);
1958         btree_set_max(b, POS_MAX);
1959         b->data->format = bch2_btree_calc_format(b);
1960         btree_node_set_format(b, b->data->format);
1961
1962         ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
1963                                           b->c.level, b->c.btree_id);
1964         BUG_ON(ret);
1965
1966         bch2_btree_set_root_inmem(c, b);
1967
1968         six_unlock_write(&b->c.lock);
1969         six_unlock_intent(&b->c.lock);
1970 }
1971
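/*
 * Debug helper (e.g. for the sysfs interface): print one line per in-flight
 * interior update, giving its address, mode, nodes_written count,
 * outstanding closure refs and journal sequence number.
 */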
1972 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
1973 {
1974         struct printbuf out = _PBUF(buf, PAGE_SIZE);
1975         struct btree_update *as;
1976
1977         mutex_lock(&c->btree_interior_update_lock);
1978         list_for_each_entry(as, &c->btree_interior_update_list, list)
1979                 pr_buf(&out, "%p m %u w %u r %u j %llu\n",
1980                        as,
1981                        as->mode,
1982                        as->nodes_written,
1983                        atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
1984                        as->journal.seq);
1985         mutex_unlock(&c->btree_interior_update_lock);
1986
1987         return out.pos - buf;
1988 }
1989
1990 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
1991 {
1992         size_t ret = 0;
1993         struct list_head *i;
1994
1995         mutex_lock(&c->btree_interior_update_lock);
1996         list_for_each(i, &c->btree_interior_update_list)
1997                 ret++;
1998         mutex_unlock(&c->btree_interior_update_lock);
1999
2000         return ret;
2001 }
2002
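/*
 * Pull btree root pointers out of a journal entry set (typically at journal
 * read time) and stash them in c->btree_roots.
 */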
2003 void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
2004 {
2005         struct btree_root *r;
2006         struct jset_entry *entry;
2007
2008         mutex_lock(&c->btree_root_lock);
2009
2010         vstruct_for_each(jset, entry)
2011                 if (entry->type == BCH_JSET_ENTRY_btree_root) {
2012                         r = &c->btree_roots[entry->btree_id];
2013                         r->level = entry->level;
2014                         r->alive = true;
2015                         bkey_copy(&r->key, &entry->start[0]);
2016                 }
2017
2018         mutex_unlock(&c->btree_root_lock);
2019 }
2020
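/*
 * The inverse of the above: for every live btree root that doesn't already
 * have a btree_root entry in [@start, @end), append one at @end, and return
 * the new end of the entry list.
 *
 * Rough usage sketch (hypothetical caller): when building up a jset,
 * something like
 *
 *	end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 *
 * ensures every live root gets persisted with that journal write.
 */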
2021 struct jset_entry *
2022 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
2023                                     struct jset_entry *start,
2024                                     struct jset_entry *end)
2025 {
2026         struct jset_entry *entry;
2027         unsigned long have = 0;
2028         unsigned i;
2029
2030         for (entry = start; entry < end; entry = vstruct_next(entry))
2031                 if (entry->type == BCH_JSET_ENTRY_btree_root)
2032                         __set_bit(entry->btree_id, &have);
2033
2034         mutex_lock(&c->btree_root_lock);
2035
2036         for (i = 0; i < BTREE_ID_NR; i++)
2037                 if (c->btree_roots[i].alive && !test_bit(i, &have)) {
2038                         journal_entry_set(end,
2039                                           BCH_JSET_ENTRY_btree_root,
2040                                           i, c->btree_roots[i].level,
2041                                           &c->btree_roots[i].key,
2042                                           c->btree_roots[i].key.u64s);
2043                         end = vstruct_next(end);
2044                 }
2045
2046         mutex_unlock(&c->btree_root_lock);
2047
2048         return end;
2049 }
2050
2051 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
2052 {
2053         if (c->btree_interior_update_worker)
2054                 destroy_workqueue(c->btree_interior_update_worker);
2055         mempool_exit(&c->btree_interior_update_pool);
2056 }
2057
2058 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
2059 {
2060         mutex_init(&c->btree_reserve_cache_lock);
2061         INIT_LIST_HEAD(&c->btree_interior_update_list);
2062         INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
2063         mutex_init(&c->btree_interior_update_lock);
2064         INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
2065
2066         c->btree_interior_update_worker =
2067                 alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
2068         if (!c->btree_interior_update_worker)
2069                 return -ENOMEM;
2070
2071         return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
2072                                          sizeof(struct btree_update));
2073 }