1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_methods.h"
6 #include "btree_cache.h"
7 #include "btree_gc.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "btree_io.h"
11 #include "btree_iter.h"
12 #include "btree_locking.h"
13 #include "buckets.h"
14 #include "error.h"
15 #include "extents.h"
16 #include "journal.h"
17 #include "journal_reclaim.h"
18 #include "keylist.h"
19 #include "replicas.h"
20 #include "super-io.h"
21
22 #include <linux/random.h>
23 #include <trace/events/bcachefs.h>
24
25 /* Debug code: */
26
27 /*
28  * Verify that child nodes correctly span parent node's range:
29  */
30 static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
31 {
32 #ifdef CONFIG_BCACHEFS_DEBUG
33         struct bpos next_node = b->data->min_key;
34         struct btree_node_iter iter;
35         struct bkey_s_c k;
36         struct bkey_s_c_btree_ptr_v2 bp;
37         struct bkey unpacked;
38
39         BUG_ON(!b->c.level);
40
41         if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
42                 return;
43
44         bch2_btree_node_iter_init_from_start(&iter, b);
45
46         while (1) {
47                 k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
48                 if (k.k->type != KEY_TYPE_btree_ptr_v2)
49                         break;
50                 bp = bkey_s_c_to_btree_ptr_v2(k);
51
52                 if (bkey_cmp(next_node, bp.v->min_key)) {
53                         bch2_dump_btree_node(c, b);
54                         panic("expected next min_key %llu:%llu got %llu:%llu\n",
55                               next_node.inode,
56                               next_node.offset,
57                               bp.v->min_key.inode,
58                               bp.v->min_key.offset);
59                 }
60
61                 bch2_btree_node_iter_advance(&iter, b);
62
63                 if (bch2_btree_node_iter_end(&iter)) {
64
65                         if (bkey_cmp(k.k->p, b->key.k.p)) {
66                                 bch2_dump_btree_node(c, b);
67                                 panic("expected end %llu:%llu got %llu:%llu\n",
68                                       b->key.k.p.inode,
69                                       b->key.k.p.offset,
70                                       k.k->p.inode,
71                                       k.k->p.offset);
72                         }
73                         break;
74                 }
75
76                 next_node = bkey_successor(k.k->p);
77         }
78 #endif
79 }
80
81 /* Calculate ideal packed bkey format for new btree nodes: */
82
83 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
84 {
85         struct bkey_packed *k;
86         struct bset_tree *t;
87         struct bkey uk;
88
89         bch2_bkey_format_add_pos(s, b->data->min_key);
90
91         for_each_bset(b, t)
92                 bset_tree_for_each_key(b, t, k)
93                         if (!bkey_whiteout(k)) {
94                                 uk = bkey_unpack_key(b, k);
95                                 bch2_bkey_format_add_key(s, &uk);
96                         }
97 }
98
99 static struct bkey_format bch2_btree_calc_format(struct btree *b)
100 {
101         struct bkey_format_state s;
102
103         bch2_bkey_format_init(&s);
104         __bch2_btree_calc_format(&s, b);
105
106         return bch2_bkey_format_done(&s);
107 }
108
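/*
 * Illustrative sketch for the helper below (hypothetical numbers, not from
 * any real tree): if the old format packs keys to 3 u64s each, a proposed new
 * format needs only 2, and the node holds 100 packed and 10 unpacked keys,
 * then
 *
 *	delta = (2 - 3) * 100 + (2 - BKEY_U64s) * 10
 *
 * i.e. repacking saves one u64 per currently-packed key, plus the difference
 * between BKEY_U64s and the new key_u64s for each key currently stored
 * unpacked; the function returns live_u64s adjusted by that delta.
 */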
109 static size_t btree_node_u64s_with_format(struct btree *b,
110                                           struct bkey_format *new_f)
111 {
112         struct bkey_format *old_f = &b->format;
113
114         /* stupid integer promotion rules */
115         ssize_t delta =
116             (((int) new_f->key_u64s - old_f->key_u64s) *
117              (int) b->nr.packed_keys) +
118             (((int) new_f->key_u64s - BKEY_U64s) *
119              (int) b->nr.unpacked_keys);
120
121         BUG_ON(delta + b->nr.live_u64s < 0);
122
123         return b->nr.live_u64s + delta;
124 }
125
126 /**
127  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
128  *
129  * This assumes all keys can pack with the new format -- it just checks if
130  * the re-packed keys would fit inside the node itself.
131  */
132 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
133                                  struct bkey_format *new_f)
134 {
135         size_t u64s = btree_node_u64s_with_format(b, new_f);
136
137         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
138 }
139
140 /* Btree node freeing/allocation: */
141
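/*
 * Free an in-memory btree node: the caller must hold the node's write lock,
 * and the node must already be clean - not dirty, no pending write, no open
 * buckets, not the current root. The node is removed from the hash table and
 * moved to the btree cache's freeable list for reuse.
 */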
142 static void __btree_node_free(struct bch_fs *c, struct btree *b)
143 {
144         trace_btree_node_free(c, b);
145
146         BUG_ON(btree_node_dirty(b));
147         BUG_ON(btree_node_need_write(b));
148         BUG_ON(b == btree_node_root(c, b));
149         BUG_ON(b->ob.nr);
150         BUG_ON(!list_empty(&b->write_blocked));
151         BUG_ON(b->will_make_reachable);
152
153         clear_btree_node_noevict(b);
154
155         bch2_btree_node_hash_remove(&c->btree_cache, b);
156
157         mutex_lock(&c->btree_cache.lock);
158         list_move(&b->list, &c->btree_cache.freeable);
159         mutex_unlock(&c->btree_cache.lock);
160 }
161
162 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
163 {
164         struct open_buckets ob = b->ob;
165
166         b->ob.nr = 0;
167
168         clear_btree_node_dirty(c, b);
169
170         btree_node_lock_type(c, b, SIX_LOCK_write);
171         __btree_node_free(c, b);
172         six_unlock_write(&b->c.lock);
173
174         bch2_open_buckets_put(c, &ob);
175 }
176
177 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
178                                 struct btree_iter *iter)
179 {
180         struct btree_iter *linked;
181
182         trans_for_each_iter(iter->trans, linked)
183                 BUG_ON(linked->l[b->c.level].b == b);
184
185         six_lock_write(&b->c.lock, NULL, NULL);
186         __btree_node_free(c, b);
187         six_unlock_write(&b->c.lock);
188         six_unlock_intent(&b->c.lock);
189 }
190
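/*
 * Allocate space on disk (and an in-memory node) for a new btree node:
 *
 * The reserve to allocate from is chosen by @flags; if the filesystem's
 * btree_reserve_cache has a spare allocation we take it from there, otherwise
 * we allocate sectors from the btree write point, retrying after discarding
 * open buckets too small to hold a whole btree node. The in-memory allocation
 * cannot fail here because the caller holds the cannibalize lock.
 */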
191 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
192                                              struct disk_reservation *res,
193                                              struct closure *cl,
194                                              unsigned flags)
195 {
196         struct write_point *wp;
197         struct btree *b;
198         BKEY_PADDED(k) tmp;
199         struct open_buckets ob = { .nr = 0 };
200         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
201         unsigned nr_reserve;
202         enum alloc_reserve alloc_reserve;
203
204         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
205                 nr_reserve      = 0;
206                 alloc_reserve   = RESERVE_ALLOC;
207         } else if (flags & BTREE_INSERT_USE_RESERVE) {
208                 nr_reserve      = BTREE_NODE_RESERVE / 2;
209                 alloc_reserve   = RESERVE_BTREE;
210         } else {
211                 nr_reserve      = BTREE_NODE_RESERVE;
212                 alloc_reserve   = RESERVE_NONE;
213         }
214
215         mutex_lock(&c->btree_reserve_cache_lock);
216         if (c->btree_reserve_cache_nr > nr_reserve) {
217                 struct btree_alloc *a =
218                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
219
220                 ob = a->ob;
221                 bkey_copy(&tmp.k, &a->k);
222                 mutex_unlock(&c->btree_reserve_cache_lock);
223                 goto mem_alloc;
224         }
225         mutex_unlock(&c->btree_reserve_cache_lock);
226
227 retry:
228         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
229                                       writepoint_ptr(&c->btree_write_point),
230                                       &devs_have,
231                                       res->nr_replicas,
232                                       c->opts.metadata_replicas_required,
233                                       alloc_reserve, 0, cl);
234         if (IS_ERR(wp))
235                 return ERR_CAST(wp);
236
237         if (wp->sectors_free < c->opts.btree_node_size) {
238                 struct open_bucket *ob;
239                 unsigned i;
240
241                 open_bucket_for_each(c, &wp->ptrs, ob, i)
242                         if (ob->sectors_free < c->opts.btree_node_size)
243                                 ob->sectors_free = 0;
244
245                 bch2_alloc_sectors_done(c, wp);
246                 goto retry;
247         }
248
249         if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
250                 bkey_btree_ptr_v2_init(&tmp.k);
251         else
252                 bkey_btree_ptr_init(&tmp.k);
253
254         bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
255
256         bch2_open_bucket_get(c, wp, &ob);
257         bch2_alloc_sectors_done(c, wp);
258 mem_alloc:
259         b = bch2_btree_node_mem_alloc(c);
260
261         /* we hold cannibalize_lock: */
262         BUG_ON(IS_ERR(b));
263         BUG_ON(b->ob.nr);
264
265         bkey_copy(&b->key, &tmp.k);
266         b->ob = ob;
267
268         return b;
269 }
270
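/*
 * Take one of the nodes preallocated by bch2_btree_reserve_get() and
 * initialize it for use at @level: set up the first bset, the node's magic,
 * level and btree id, the btree_ptr_v2 fields if that feature is enabled, and
 * insert it into the btree cache hash table. The new node is returned dirty
 * and marked as needing a write.
 */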
271 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
272 {
273         struct bch_fs *c = as->c;
274         struct btree *b;
275         int ret;
276
277         BUG_ON(level >= BTREE_MAX_DEPTH);
278         BUG_ON(!as->nr_prealloc_nodes);
279
280         b = as->prealloc_nodes[--as->nr_prealloc_nodes];
281
282         set_btree_node_accessed(b);
283         set_btree_node_dirty(c, b);
284         set_btree_node_need_write(b);
285
286         bch2_bset_init_first(b, &b->data->keys);
287         b->c.level      = level;
288         b->c.btree_id   = as->btree_id;
289
290         memset(&b->nr, 0, sizeof(b->nr));
291         b->data->magic = cpu_to_le64(bset_magic(c));
292         b->data->flags = 0;
293         SET_BTREE_NODE_ID(b->data, as->btree_id);
294         SET_BTREE_NODE_LEVEL(b->data, level);
295         b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
296
297         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
298                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
299
300                 bp->v.mem_ptr           = 0;
301                 bp->v.seq               = b->data->keys.seq;
302                 bp->v.sectors_written   = 0;
303                 bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
304         }
305
306         if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
307                 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
308
309         if (btree_node_is_extents(b) &&
310             !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
311                 set_btree_node_old_extent_overwrite(b);
312                 set_btree_node_need_rewrite(b);
313         }
314
315         bch2_btree_build_aux_trees(b);
316
317         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
318         BUG_ON(ret);
319
320         trace_btree_node_alloc(c, b);
321         return b;
322 }
323
324 static void btree_set_min(struct btree *b, struct bpos pos)
325 {
326         if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
327                 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
328         b->data->min_key = pos;
329 }
330
331 static void btree_set_max(struct btree *b, struct bpos pos)
332 {
333         b->key.k.p = pos;
334         b->data->max_key = pos;
335 }
336
337 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
338                                                   struct btree *b,
339                                                   struct bkey_format format)
340 {
341         struct btree *n;
342
343         n = bch2_btree_node_alloc(as, b->c.level);
344
345         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
346
347         btree_set_min(n, b->data->min_key);
348         btree_set_max(n, b->data->max_key);
349
350         n->data->format         = format;
351         btree_node_set_format(n, format);
352
353         bch2_btree_sort_into(as->c, n, b);
354
355         btree_node_reset_sib_u64s(n);
356
357         n->key.k.p = b->key.k.p;
358         return n;
359 }
360
361 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
362                                                        struct btree *b)
363 {
364         struct bkey_format new_f = bch2_btree_calc_format(b);
365
366         /*
367          * The keys might expand with the new format - if they wouldn't fit in
368          * the btree node anymore, use the old format for now:
369          */
370         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
371                 new_f = b->format;
372
373         return __bch2_btree_node_alloc_replacement(as, b, new_f);
374 }
375
376 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
377 {
378         struct btree *b = bch2_btree_node_alloc(as, level);
379
380         btree_set_min(b, POS_MIN);
381         btree_set_max(b, POS_MAX);
382         b->data->format = bch2_btree_calc_format(b);
383
384         btree_node_set_format(b, b->data->format);
385         bch2_btree_build_aux_trees(b);
386
387         bch2_btree_update_add_new_node(as, b);
388         six_unlock_write(&b->c.lock);
389
390         return b;
391 }
392
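/*
 * Return any preallocated nodes the btree_update didn't end up using: their
 * open buckets and keys are stashed in the btree_reserve_cache if there's
 * room (so a later update can reuse the allocation), otherwise the open
 * buckets are released; either way the in-memory nodes themselves are freed.
 */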
393 static void bch2_btree_reserve_put(struct btree_update *as)
394 {
395         struct bch_fs *c = as->c;
396
397         mutex_lock(&c->btree_reserve_cache_lock);
398
399         while (as->nr_prealloc_nodes) {
400                 struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
401
402                 six_unlock_write(&b->c.lock);
403
404                 if (c->btree_reserve_cache_nr <
405                     ARRAY_SIZE(c->btree_reserve_cache)) {
406                         struct btree_alloc *a =
407                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
408
409                         a->ob = b->ob;
410                         b->ob.nr = 0;
411                         bkey_copy(&a->k, &b->key);
412                 } else {
413                         bch2_open_buckets_put(c, &b->ob);
414                 }
415
416                 btree_node_lock_type(c, b, SIX_LOCK_write);
417                 __btree_node_free(c, b);
418                 six_unlock_write(&b->c.lock);
419
420                 six_unlock_intent(&b->c.lock);
421         }
422
423         mutex_unlock(&c->btree_reserve_cache_lock);
424 }
425
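/*
 * Preallocate @nr_nodes btree nodes for the update; with BTREE_INSERT_NOWAIT
 * the underlying allocator is not given @cl and so won't block. Each new
 * node's key has its replicas entry marked before the node is added to the
 * update's prealloc list.
 */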
426 static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
427                                   unsigned flags, struct closure *cl)
428 {
429         struct bch_fs *c = as->c;
430         struct btree *b;
431         int ret;
432
433         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
434
435         /*
436          * Protects reaping from the btree node cache and using the btree node
437          * open bucket reserve:
438          */
439         ret = bch2_btree_cache_cannibalize_lock(c, cl);
440         if (ret)
441                 return ret;
442
443         while (as->nr_prealloc_nodes < nr_nodes) {
444                 b = __bch2_btree_node_alloc(c, &as->disk_res,
445                                             flags & BTREE_INSERT_NOWAIT
446                                             ? NULL : cl, flags);
447                 if (IS_ERR(b)) {
448                         ret = PTR_ERR(b);
449                         goto err_free;
450                 }
451
452                 ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
453                 if (ret)
454                         goto err_free;
455
456                 as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
457         }
458
459         bch2_btree_cache_cannibalize_unlock(c);
460         return 0;
461 err_free:
462         bch2_btree_cache_cannibalize_unlock(c);
463         trace_btree_reserve_get_fail(c, nr_nodes, cl);
464         return ret;
465 }
466
467 /* Asynchronous interior node update machinery */
468
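/*
 * Tear down a btree_update: release its journal prereservation and pin, its
 * disk reservation and any unused preallocated nodes, unlink it from the
 * interior update lists, and return it to the mempool, waking anyone waiting
 * for interior updates to drain.
 */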
469 static void bch2_btree_update_free(struct btree_update *as)
470 {
471         struct bch_fs *c = as->c;
472
473         bch2_journal_preres_put(&c->journal, &as->journal_preres);
474
475         bch2_journal_pin_drop(&c->journal, &as->journal);
476         bch2_journal_pin_flush(&c->journal, &as->journal);
477         bch2_disk_reservation_put(c, &as->disk_res);
478         bch2_btree_reserve_put(as);
479
480         mutex_lock(&c->btree_interior_update_lock);
481         list_del(&as->unwritten_list);
482         list_del(&as->list);
483         mutex_unlock(&c->btree_interior_update_lock);
484
485         closure_debug_destroy(&as->cl);
486         mempool_free(as, &c->btree_interior_update_pool);
487
488         closure_wake_up(&c->btree_interior_update_wait);
489 }
490
491 static void btree_update_will_delete_key(struct btree_update *as,
492                                          struct bkey_i *k)
493 {
494         BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
495                ARRAY_SIZE(as->_old_keys));
496         bch2_keylist_add(&as->old_keys, k);
497 }
498
499 static void btree_update_will_add_key(struct btree_update *as,
500                                       struct bkey_i *k)
501 {
502         BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
503                ARRAY_SIZE(as->_new_keys));
504         bch2_keylist_add(&as->new_keys, k);
505 }
506
507 /*
508  * The transactional part of an interior btree node update, where we journal the
509  * update we did to the interior node and update alloc info:
510  */
511 static int btree_update_nodes_written_trans(struct btree_trans *trans,
512                                             struct btree_update *as)
513 {
514         struct bkey_i *k;
515         int ret;
516
517         trans->extra_journal_entries = (void *) &as->journal_entries[0];
518         trans->extra_journal_entry_u64s = as->journal_u64s;
519         trans->journal_pin = &as->journal;
520
521         for_each_keylist_key(&as->new_keys, k) {
522                 ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
523                                           0, 0, BTREE_TRIGGER_INSERT);
524                 if (ret)
525                         return ret;
526         }
527
528         for_each_keylist_key(&as->old_keys, k) {
529                 ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k),
530                                           0, 0, BTREE_TRIGGER_OVERWRITE);
531                 if (ret)
532                         return ret;
533         }
534
535         return 0;
536 }
537
538 static void btree_update_nodes_written(struct btree_update *as)
539 {
540         struct bch_fs *c = as->c;
541         struct btree *b = as->b;
542         struct btree_trans trans;
543         u64 journal_seq = 0;
544         unsigned i;
545         int ret;
546
547         /*
548          * We did an update to a parent node where the pointers we added pointed
549          * to child nodes that weren't written yet: now, the child nodes have
550          * been written so we can write out the update to the interior node.
551          */
552
553         /*
554          * We can't call into journal reclaim here: we'd block on the journal
555          * reclaim lock, but we may need to release the open buckets we have
556          * pinned in order for other btree updates to make forward progress, and
557          * journal reclaim does btree updates when flushing bkey_cached entries,
558          * which may require allocations as well.
559          */
560         bch2_trans_init(&trans, c, 0, 512);
561         ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
562                               BTREE_INSERT_NOFAIL|
563                               BTREE_INSERT_USE_RESERVE|
564                               BTREE_INSERT_USE_ALLOC_RESERVE|
565                               BTREE_INSERT_NOCHECK_RW|
566                               BTREE_INSERT_JOURNAL_RECLAIM|
567                               BTREE_INSERT_JOURNAL_RESERVED,
568                               btree_update_nodes_written_trans(&trans, as));
569         bch2_trans_exit(&trans);
570         BUG_ON(ret && !bch2_journal_error(&c->journal));
571
572         if (b) {
573                 /*
574                  * @b is the node we did the final insert into:
575                  *
576                  * On failure to get a journal reservation, we still have to
577                  * unblock the write and allow most of the write path to happen
578                  * so that shutdown works, but the i->journal_seq mechanism
579                  * won't work to prevent the btree write from being visible (we
580                  * didn't get a journal sequence number) - instead
581                  * __bch2_btree_node_write() doesn't do the actual write if
582                  * we're in journal error state:
583                  */
584
585                 btree_node_lock_type(c, b, SIX_LOCK_intent);
586                 btree_node_lock_type(c, b, SIX_LOCK_write);
587                 mutex_lock(&c->btree_interior_update_lock);
588
589                 list_del(&as->write_blocked_list);
590
591                 if (!ret && as->b == b) {
592                         struct bset *i = btree_bset_last(b);
593
594                         BUG_ON(!b->c.level);
595                         BUG_ON(!btree_node_dirty(b));
596
597                         i->journal_seq = cpu_to_le64(
598                                 max(journal_seq,
599                                     le64_to_cpu(i->journal_seq)));
600
601                         bch2_btree_add_journal_pin(c, b, journal_seq);
602                 }
603
604                 mutex_unlock(&c->btree_interior_update_lock);
605                 six_unlock_write(&b->c.lock);
606
607                 btree_node_write_if_need(c, b, SIX_LOCK_intent);
608                 six_unlock_intent(&b->c.lock);
609         }
610
611         bch2_journal_pin_drop(&c->journal, &as->journal);
612
613         bch2_journal_preres_put(&c->journal, &as->journal_preres);
614
615         mutex_lock(&c->btree_interior_update_lock);
616         for (i = 0; i < as->nr_new_nodes; i++) {
617                 b = as->new_nodes[i];
618
619                 BUG_ON(b->will_make_reachable != (unsigned long) as);
620                 b->will_make_reachable = 0;
621         }
622         mutex_unlock(&c->btree_interior_update_lock);
623
624         for (i = 0; i < as->nr_new_nodes; i++) {
625                 b = as->new_nodes[i];
626
627                 btree_node_lock_type(c, b, SIX_LOCK_read);
628                 btree_node_write_if_need(c, b, SIX_LOCK_read);
629                 six_unlock_read(&b->c.lock);
630         }
631
632         for (i = 0; i < as->nr_open_buckets; i++)
633                 bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
634
635         bch2_btree_update_free(as);
636 }
637
638 static void btree_interior_update_work(struct work_struct *work)
639 {
640         struct bch_fs *c =
641                 container_of(work, struct bch_fs, btree_interior_update_work);
642         struct btree_update *as;
643
644         while (1) {
645                 mutex_lock(&c->btree_interior_update_lock);
646                 as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
647                                               struct btree_update, unwritten_list);
648                 if (as && !as->nodes_written)
649                         as = NULL;
650                 mutex_unlock(&c->btree_interior_update_lock);
651
652                 if (!as)
653                         break;
654
655                 btree_update_nodes_written(as);
656         }
657 }
658
659 static void btree_update_set_nodes_written(struct closure *cl)
660 {
661         struct btree_update *as = container_of(cl, struct btree_update, cl);
662         struct bch_fs *c = as->c;
663
664         mutex_lock(&c->btree_interior_update_lock);
665         as->nodes_written = true;
666         mutex_unlock(&c->btree_interior_update_lock);
667
668         queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
669 }
670
671 /*
672  * We're updating @b with pointers to nodes that haven't finished writing yet:
673  * block @b from being written until @as completes
674  */
675 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
676 {
677         struct bch_fs *c = as->c;
678
679         mutex_lock(&c->btree_interior_update_lock);
680         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
681
682         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
683         BUG_ON(!btree_node_dirty(b));
684
685         as->mode        = BTREE_INTERIOR_UPDATING_NODE;
686         as->b           = b;
687         list_add(&as->write_blocked_list, &b->write_blocked);
688
689         mutex_unlock(&c->btree_interior_update_lock);
690 }
691
692 static void btree_update_reparent(struct btree_update *as,
693                                   struct btree_update *child)
694 {
695         struct bch_fs *c = as->c;
696
697         lockdep_assert_held(&c->btree_interior_update_lock);
698
699         child->b = NULL;
700         child->mode = BTREE_INTERIOR_UPDATING_AS;
701
702         /*
703          * When we write a new btree root, we have to drop our journal pin
704          * _before_ the new nodes are technically reachable; see
705          * btree_update_nodes_written().
706          *
707          * This goes for journal pins that are recursively blocked on us - so,
708          * just transfer the journal pin to the new interior update so
709          * btree_update_nodes_written() can drop it.
710          */
711         bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
712         bch2_journal_pin_drop(&c->journal, &child->journal);
713 }
714
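/*
 * The update is making @b the new btree root: record the new root key as a
 * BCH_JSET_ENTRY_btree_root entry in the update's journal entry buffer (it
 * will be journalled along with the rest of the update) and mark the update
 * as BTREE_INTERIOR_UPDATING_ROOT.
 */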
715 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
716 {
717         struct bkey_i *insert = &b->key;
718         struct bch_fs *c = as->c;
719
720         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
721
722         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
723                ARRAY_SIZE(as->journal_entries));
724
725         as->journal_u64s +=
726                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
727                                   BCH_JSET_ENTRY_btree_root,
728                                   b->c.btree_id, b->c.level,
729                                   insert, insert->k.u64s);
730
731         mutex_lock(&c->btree_interior_update_lock);
732         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
733
734         as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
735         mutex_unlock(&c->btree_interior_update_lock);
736 }
737
738 /*
739  * bch2_btree_update_add_new_node:
740  *
741  * This causes @as to wait on @b to be written, before it gets to
742  * bch2_btree_update_nodes_written
743  * btree_update_nodes_written()
744  * Additionally, it sets b->will_make_reachable to prevent any additional writes
745  * to @b from happening besides the first until @b is reachable on disk
746  *
747  * And it adds @b to the list of @as's new nodes, so that we can update sector
748  * counts in btree_update_nodes_written():
749  */
750 void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
751 {
752         struct bch_fs *c = as->c;
753
754         closure_get(&as->cl);
755
756         mutex_lock(&c->btree_interior_update_lock);
757         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
758         BUG_ON(b->will_make_reachable);
759
760         as->new_nodes[as->nr_new_nodes++] = b;
761         b->will_make_reachable = 1UL|(unsigned long) as;
762
763         mutex_unlock(&c->btree_interior_update_lock);
764
765         btree_update_will_add_key(as, &b->key);
766 }
767
768 /*
769  * If @b was a new node, remove it from its btree_update's list of new nodes
770  */
771 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
772 {
773         struct btree_update *as;
774         unsigned long v;
775         unsigned i;
776
777         mutex_lock(&c->btree_interior_update_lock);
778         /*
779          * When b->will_make_reachable != 0, it owns a ref on as->cl that's
780          * dropped when it gets written by bch2_btree_complete_write - the
781          * xchg() is for synchronization with bch2_btree_complete_write:
782          */
783         v = xchg(&b->will_make_reachable, 0);
784         as = (struct btree_update *) (v & ~1UL);
785
786         if (!as) {
787                 mutex_unlock(&c->btree_interior_update_lock);
788                 return;
789         }
790
791         for (i = 0; i < as->nr_new_nodes; i++)
792                 if (as->new_nodes[i] == b)
793                         goto found;
794
795         BUG();
796 found:
797         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
798         mutex_unlock(&c->btree_interior_update_lock);
799
800         if (v & 1)
801                 closure_put(&as->cl);
802 }
803
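/*
 * Transfer @b's open bucket references to the btree_update: they'll be
 * released in btree_update_nodes_written(), once the new node is reachable
 * on disk.
 */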
804 void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
805 {
806         while (b->ob.nr)
807                 as->open_buckets[as->nr_open_buckets++] =
808                         b->ob.v[--b->ob.nr];
809 }
810
811 /*
812  * @b is being split/rewritten: it may have pointers to not-yet-written btree
813  * nodes and thus outstanding btree_updates - redirect @b's
814  * btree_updates to point to this btree_update:
815  */
816 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
817                                                struct btree *b)
818 {
819         struct bch_fs *c = as->c;
820         struct btree_update *p, *n;
821         struct btree_write *w;
822
823         set_btree_node_dying(b);
824
825         if (btree_node_fake(b))
826                 return;
827
828         mutex_lock(&c->btree_interior_update_lock);
829
830         /*
831          * Does this node have any btree_update operations preventing
832          * it from being written?
833          *
834          * If so, redirect them to point to this btree_update: we can
835          * write out our new nodes, but we won't make them visible until those
836          * operations complete
837          */
838         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
839                 list_del_init(&p->write_blocked_list);
840                 btree_update_reparent(as, p);
841
842                 /*
843                  * for flush_held_btree_writes() waiting on updates to flush or
844                  * nodes to be writeable:
845                  */
846                 closure_wake_up(&c->btree_interior_update_wait);
847         }
848
849         clear_btree_node_dirty(c, b);
850         clear_btree_node_need_write(b);
851
852         /*
853          * Does this node have unwritten data that has a pin on the journal?
854          *
855          * If so, transfer that pin to the btree_update operation -
856          * note that if we're freeing multiple nodes, we only need to keep the
857          * oldest pin of any of the nodes we're freeing. We'll release the pin
858          * when the new nodes are persistent and reachable on disk:
859          */
860         w = btree_current_write(b);
861         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
862         bch2_journal_pin_drop(&c->journal, &w->journal);
863
864         w = btree_prev_write(b);
865         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
866         bch2_journal_pin_drop(&c->journal, &w->journal);
867
868         mutex_unlock(&c->btree_interior_update_lock);
869
870         /*
871          * Is this a node that isn't reachable on disk yet?
872          *
873          * Nodes that aren't reachable yet have writes blocked until they're
874          * reachable - now that we've cancelled any pending writes and moved
875          * things waiting on that write to wait on this update, we can drop this
876          * node from the list of nodes that the other update is making
877          * reachable, prior to freeing it:
878          */
879         btree_update_drop_new_node(c, b);
880
881         btree_update_will_delete_key(as, &b->key);
882 }
883
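/*
 * The update has been fully set up: return any preallocated nodes it didn't
 * use, then continue the update's closure - btree_update_set_nodes_written()
 * runs once all outstanding references (one per new node still being written)
 * have been dropped.
 */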
884 void bch2_btree_update_done(struct btree_update *as)
885 {
886         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
887
888         bch2_btree_reserve_put(as);
889
890         continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
891 }
892
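/*
 * Illustrative sketch of how an interior update is driven (this mirrors
 * bch2_btree_split_leaf() below; error handling omitted):
 *
 *	as = bch2_btree_update_start(trans, iter->btree_id,
 *				     btree_update_reserve_required(c, b),
 *				     flags, &cl);
 *	btree_split(as, b, iter, NULL, flags);
 *	bch2_btree_update_done(as);
 *
 * bch2_btree_update_start() takes the journal, disk and node reservations;
 * the split (or interior insert) then consumes them, and
 * bch2_btree_update_done() hands back whatever is left and kicks off the
 * machinery that makes the new nodes visible once they're written.
 */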
893 struct btree_update *
894 bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
895                         unsigned nr_nodes, unsigned flags,
896                         struct closure *cl)
897 {
898         struct bch_fs *c = trans->c;
899         struct btree_update *as;
900         int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
901                 ? BCH_DISK_RESERVATION_NOFAIL : 0;
902         int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
903                 ? JOURNAL_RES_GET_RECLAIM : 0;
904         int ret = 0;
905
906         /*
907          * This check isn't necessary for correctness - it's just to potentially
908          * prevent us from doing a lot of work that'll end up being wasted:
909          */
910         ret = bch2_journal_error(&c->journal);
911         if (ret)
912                 return ERR_PTR(ret);
913
914         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
915         memset(as, 0, sizeof(*as));
916         closure_init(&as->cl, NULL);
917         as->c           = c;
918         as->mode        = BTREE_INTERIOR_NO_UPDATE;
919         as->btree_id    = id;
920         INIT_LIST_HEAD(&as->list);
921         INIT_LIST_HEAD(&as->unwritten_list);
922         INIT_LIST_HEAD(&as->write_blocked_list);
923         bch2_keylist_init(&as->old_keys, as->_old_keys);
924         bch2_keylist_init(&as->new_keys, as->_new_keys);
925         bch2_keylist_init(&as->parent_keys, as->inline_keys);
926
927         ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
928                                       BTREE_UPDATE_JOURNAL_RES,
929                                       journal_flags|JOURNAL_RES_GET_NONBLOCK);
930         if (ret == -EAGAIN) {
931                 if (flags & BTREE_INSERT_NOUNLOCK)
932                         return ERR_PTR(-EINTR);
933
934                 bch2_trans_unlock(trans);
935
936                 ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
937                                 BTREE_UPDATE_JOURNAL_RES,
938                                 journal_flags);
939                 if (ret)
940                         return ERR_PTR(ret);
941
942                 if (!bch2_trans_relock(trans)) {
943                         ret = -EINTR;
944                         goto err;
945                 }
946         }
947
948         ret = bch2_disk_reservation_get(c, &as->disk_res,
949                         nr_nodes * c->opts.btree_node_size,
950                         c->opts.metadata_replicas,
951                         disk_res_flags);
952         if (ret)
953                 goto err;
954
955         ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl);
956         if (ret)
957                 goto err;
958
959         mutex_lock(&c->btree_interior_update_lock);
960         list_add_tail(&as->list, &c->btree_interior_update_list);
961         mutex_unlock(&c->btree_interior_update_lock);
962
963         return as;
964 err:
965         bch2_btree_update_free(as);
966         return ERR_PTR(ret);
967 }
968
969 /* Btree root updates: */
970
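/*
 * Make @b the root in memory: take it off the btree cache lists so it can't
 * be reaped, point btree_node_root() at it (the old root, if any, must
 * already be dying and the new root at least as deep), and recalculate the
 * btree node reserve.
 */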
971 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
972 {
973         /* Root nodes cannot be reaped */
974         mutex_lock(&c->btree_cache.lock);
975         list_del_init(&b->list);
976         mutex_unlock(&c->btree_cache.lock);
977
978         mutex_lock(&c->btree_root_lock);
979         BUG_ON(btree_node_root(c, b) &&
980                (b->c.level < btree_node_root(c, b)->c.level ||
981                 !btree_node_dying(btree_node_root(c, b))));
982
983         btree_node_root(c, b) = b;
984         mutex_unlock(&c->btree_root_lock);
985
986         bch2_recalc_btree_reserve(c);
987 }
988
989 /**
990  * bch2_btree_set_root - update the root in memory and on disk
991  *
992  * To ensure forward progress, the current task must not be holding any
993  * btree node write locks. However, you must hold an intent lock on the
994  * old root.
995  *
996  * Note: This allocates a journal entry but doesn't add any keys to
997  * it.  All the btree roots are part of every journal write, so there
998  * is nothing new to be done.  This just guarantees that there is a
999  * journal write.
1000  */
1001 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
1002                                 struct btree_iter *iter)
1003 {
1004         struct bch_fs *c = as->c;
1005         struct btree *old;
1006
1007         trace_btree_set_root(c, b);
1008         BUG_ON(!b->written &&
1009                !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
1010
1011         old = btree_node_root(c, b);
1012
1013         /*
1014          * Ensure no one is using the old root while we switch to the
1015          * new root:
1016          */
1017         bch2_btree_node_lock_write(old, iter);
1018
1019         bch2_btree_set_root_inmem(c, b);
1020
1021         btree_update_updated_root(as, b);
1022
1023         /*
1024          * Unlock old root after new root is visible:
1025          *
1026          * The new root isn't persistent, but that's ok: we still have
1027          * an intent lock on the new root, and any updates that would
1028          * depend on the new root would have to update the new root.
1029          */
1030         bch2_btree_node_unlock_write(old, iter);
1031 }
1032
1033 /* Interior node updates: */
1034
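/*
 * Insert a single btree pointer into interior node @b: validate it (logging
 * an inconsistency rather than failing if it's bad), append it to the
 * update's journal entry buffer as a BCH_JSET_ENTRY_btree_keys entry, advance
 * @node_iter past any keys that sort before it, then insert it into the bset
 * and mark @b dirty and needing a write.
 */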
1035 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1036                                         struct btree_iter *iter,
1037                                         struct bkey_i *insert,
1038                                         struct btree_node_iter *node_iter)
1039 {
1040         struct bch_fs *c = as->c;
1041         struct bkey_packed *k;
1042         const char *invalid;
1043
1044         invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
1045                 bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
1046         if (invalid) {
1047                 char buf[160];
1048
1049                 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
1050                 bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
1051                 dump_stack();
1052         }
1053
1054         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
1055                ARRAY_SIZE(as->journal_entries));
1056
1057         as->journal_u64s +=
1058                 journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
1059                                   BCH_JSET_ENTRY_btree_keys,
1060                                   b->c.btree_id, b->c.level,
1061                                   insert, insert->k.u64s);
1062
1063         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1064                bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1065                 bch2_btree_node_iter_advance(node_iter, b);
1066
1067         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1068         set_btree_node_dirty(c, b);
1069         set_btree_node_need_write(b);
1070 }
1071
1072 /*
1073  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1074  * node)
1075  */
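/*
 * The split point is found by a linear scan: keys stay in n1 until roughly
 * 3/5 of set1's u64s have been consumed, and everything after that is copied
 * into n2; min/max keys and the key counts of both nodes are then fixed up
 * around the chosen pivot.
 */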
1076 static struct btree *__btree_split_node(struct btree_update *as,
1077                                         struct btree *n1,
1078                                         struct btree_iter *iter)
1079 {
1080         size_t nr_packed = 0, nr_unpacked = 0;
1081         struct btree *n2;
1082         struct bset *set1, *set2;
1083         struct bkey_packed *k, *prev = NULL;
1084
1085         n2 = bch2_btree_node_alloc(as, n1->c.level);
1086         bch2_btree_update_add_new_node(as, n2);
1087
1088         n2->data->max_key       = n1->data->max_key;
1089         n2->data->format        = n1->format;
1090         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1091         n2->key.k.p = n1->key.k.p;
1092
1093         btree_node_set_format(n2, n2->data->format);
1094
1095         set1 = btree_bset_first(n1);
1096         set2 = btree_bset_first(n2);
1097
1098         /*
1099          * Has to be a linear search because we don't have an auxiliary
1100          * search tree yet
1101          */
1102         k = set1->start;
1103         while (1) {
1104                 struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
1105
1106                 if (n == vstruct_last(set1))
1107                         break;
1108                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1109                         break;
1110
1111                 if (bkey_packed(k))
1112                         nr_packed++;
1113                 else
1114                         nr_unpacked++;
1115
1116                 prev = k;
1117                 k = n;
1118         }
1119
1120         BUG_ON(!prev);
1121
1122         btree_set_max(n1, bkey_unpack_pos(n1, prev));
1123         btree_set_min(n2, bkey_successor(n1->key.k.p));
1124
1125         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1126         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1127
1128         set_btree_bset_end(n1, n1->set);
1129         set_btree_bset_end(n2, n2->set);
1130
1131         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1132         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1133         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1134         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1135
1136         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1137         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1138         n1->nr.packed_keys      = nr_packed;
1139         n1->nr.unpacked_keys    = nr_unpacked;
1140
1141         BUG_ON(!set1->u64s);
1142         BUG_ON(!set2->u64s);
1143
1144         memcpy_u64s(set2->start,
1145                     vstruct_end(set1),
1146                     le16_to_cpu(set2->u64s));
1147
1148         btree_node_reset_sib_u64s(n1);
1149         btree_node_reset_sib_u64s(n2);
1150
1151         bch2_verify_btree_nr_keys(n1);
1152         bch2_verify_btree_nr_keys(n2);
1153
1154         if (n1->c.level) {
1155                 btree_node_interior_verify(as->c, n1);
1156                 btree_node_interior_verify(as->c, n2);
1157         }
1158
1159         return n2;
1160 }
1161
1162 /*
1163  * For updates to interior nodes, we've got to do the insert before we split
1164  * because the stuff we're inserting has to be inserted atomically. Post split,
1165  * the keys might have to go in different nodes and the split would no longer be
1166  * atomic.
1167  *
1168  * Worse, if the insert is from btree node coalescing, if we do the insert after
1169  * we do the split (and pick the pivot) - the pivot we pick might be between
1170  * nodes that were coalesced, and thus in the middle of a child node post
1171  * coalescing:
1172  */
1173 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1174                                     struct btree_iter *iter,
1175                                     struct keylist *keys)
1176 {
1177         struct btree_node_iter node_iter;
1178         struct bkey_i *k = bch2_keylist_front(keys);
1179         struct bkey_packed *src, *dst, *n;
1180         struct bset *i;
1181
1182         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1183
1184         bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1185
1186         while (!bch2_keylist_empty(keys)) {
1187                 k = bch2_keylist_front(keys);
1188
1189                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1190                 bch2_keylist_pop_front(keys);
1191         }
1192
1193         /*
1194          * We can't tolerate whiteouts here - with whiteouts there can be
1195          * duplicate keys, and it would be rather bad if we picked a duplicate
1196          * for the pivot:
1197          */
1198         i = btree_bset_first(b);
1199         src = dst = i->start;
1200         while (src != vstruct_last(i)) {
1201                 n = bkey_next_skip_noops(src, vstruct_last(i));
1202                 if (!bkey_deleted(src)) {
1203                         memmove_u64s_down(dst, src, src->u64s);
1204                         dst = bkey_next(dst);
1205                 }
1206                 src = n;
1207         }
1208
1209         i->u64s = cpu_to_le16((u64 *) dst - i->_data);
1210         set_btree_bset_end(b, b->set);
1211
1212         BUG_ON(b->nsets != 1 ||
1213                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1214
1215         btree_node_interior_verify(as->c, b);
1216 }
1217
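/*
 * Split (or just compact) @b: the keys are rewritten into a replacement node
 * n1; if that still exceeds BTREE_SPLIT_THRESHOLD() it is split into n1/n2,
 * and if @b was the root a new root n3 is allocated one level up. The new
 * nodes are written out, their keys inserted into the parent (or installed as
 * the new root), the iterator is repointed at them, and @b is freed in
 * memory.
 */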
1218 static void btree_split(struct btree_update *as, struct btree *b,
1219                         struct btree_iter *iter, struct keylist *keys,
1220                         unsigned flags)
1221 {
1222         struct bch_fs *c = as->c;
1223         struct btree *parent = btree_node_parent(iter, b);
1224         struct btree *n1, *n2 = NULL, *n3 = NULL;
1225         u64 start_time = local_clock();
1226
1227         BUG_ON(!parent && (b != btree_node_root(c, b)));
1228         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
1229
1230         bch2_btree_interior_update_will_free_node(as, b);
1231
1232         n1 = bch2_btree_node_alloc_replacement(as, b);
1233         bch2_btree_update_add_new_node(as, n1);
1234
1235         if (keys)
1236                 btree_split_insert_keys(as, n1, iter, keys);
1237
1238         if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
1239                 trace_btree_split(c, b);
1240
1241                 n2 = __btree_split_node(as, n1, iter);
1242
1243                 bch2_btree_build_aux_trees(n2);
1244                 bch2_btree_build_aux_trees(n1);
1245                 six_unlock_write(&n2->c.lock);
1246                 six_unlock_write(&n1->c.lock);
1247
1248                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1249
1250                 /*
1251                  * Note that on recursive splits, parent_keys == keys, so we
1252                  * can't start adding new keys to parent_keys before emptying it
1253                  * out (which we did with btree_split_insert_keys() above)
1254                  */
1255                 bch2_keylist_add(&as->parent_keys, &n1->key);
1256                 bch2_keylist_add(&as->parent_keys, &n2->key);
1257
1258                 if (!parent) {
1259                         /* Depth increases, make a new root */
1260                         n3 = __btree_root_alloc(as, b->c.level + 1);
1261
1262                         n3->sib_u64s[0] = U16_MAX;
1263                         n3->sib_u64s[1] = U16_MAX;
1264
1265                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1266
1267                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1268                 }
1269         } else {
1270                 trace_btree_compact(c, b);
1271
1272                 bch2_btree_build_aux_trees(n1);
1273                 six_unlock_write(&n1->c.lock);
1274
1275                 if (parent)
1276                         bch2_keylist_add(&as->parent_keys, &n1->key);
1277         }
1278
1279         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1280
1281         /* New nodes all written, now make them visible: */
1282
1283         if (parent) {
1284                 /* Split a non root node */
1285                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1286         } else if (n3) {
1287                 bch2_btree_set_root(as, n3, iter);
1288         } else {
1289                 /* Root filled up but didn't need to be split */
1290                 bch2_btree_set_root(as, n1, iter);
1291         }
1292
1293         bch2_btree_update_get_open_buckets(as, n1);
1294         if (n2)
1295                 bch2_btree_update_get_open_buckets(as, n2);
1296         if (n3)
1297                 bch2_btree_update_get_open_buckets(as, n3);
1298
1299         /* Successful split, update the iterator to point to the new nodes: */
1300
1301         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1302         bch2_btree_iter_node_drop(iter, b);
1303         if (n3)
1304                 bch2_btree_iter_node_replace(iter, n3);
1305         if (n2)
1306                 bch2_btree_iter_node_replace(iter, n2);
1307         bch2_btree_iter_node_replace(iter, n1);
1308
1309         /*
1310          * The old node must be freed (in memory) _before_ unlocking the new
1311          * nodes - else another thread could re-acquire a read lock on the old
1312          * node after another thread has locked and updated the new node, thus
1313          * seeing stale data:
1314          */
1315         bch2_btree_node_free_inmem(c, b, iter);
1316
1317         if (n3)
1318                 six_unlock_intent(&n3->c.lock);
1319         if (n2)
1320                 six_unlock_intent(&n2->c.lock);
1321         six_unlock_intent(&n1->c.lock);
1322
1323         bch2_btree_trans_verify_locks(iter->trans);
1324
1325         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
1326                                start_time);
1327 }
1328
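/*
 * Insert every key on @keys into @b, using a private copy of the iterator's
 * node iterator so @iter's position is preserved, then mark the update as
 * blocking @b's write and re-peek all linked iterators so their cached
 * positions in @b stay valid.
 */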
1329 static void
1330 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1331                                 struct btree_iter *iter, struct keylist *keys)
1332 {
1333         struct btree_iter *linked;
1334         struct btree_node_iter node_iter;
1335         struct bkey_i *insert = bch2_keylist_front(keys);
1336         struct bkey_packed *k;
1337
1338         /* Don't screw up @iter's position: */
1339         node_iter = iter->l[b->c.level].iter;
1340
1341         /*
1342          * btree_split(), btree_gc_coalesce() will insert keys before
1343          * the iterator's current position - they know the keys go in
1344          * the node the iterator points to:
1345          */
1346         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1347                (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
1348                 ;
1349
1350         for_each_keylist_key(keys, insert)
1351                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1352
1353         btree_update_updated_node(as, b);
1354
1355         trans_for_each_iter_with_node(iter->trans, b, linked)
1356                 bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
1357
1358         bch2_btree_trans_verify_iters(iter->trans, b);
1359 }
1360
1361 /**
1362  * bch2_btree_insert_node - insert bkeys into a given btree node
1363  *
1364  * @as:                 btree_update this insert is part of
1365  * @b:                  interior btree node being inserted into
1366  * @iter:               btree iterator
1367  * @keys:               list of keys to insert
1368  * @flags:              BTREE_INSERT_* flags
1369  *
1370  * Inserts as many keys as it can into a given btree node, splitting it if full.
1371  * If a split occurred, this function will return early.
1372  */
1373 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1374                             struct btree_iter *iter, struct keylist *keys,
1375                             unsigned flags)
1376 {
1377         struct bch_fs *c = as->c;
1378         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1379         int old_live_u64s = b->nr.live_u64s;
1380         int live_u64s_added, u64s_added;
1381
1382         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
1383         BUG_ON(!b->c.level);
1384         BUG_ON(!as || as->b);
1385         bch2_verify_keylist_sorted(keys);
1386
1387         bch2_btree_node_lock_for_insert(c, b, iter);
1388
1389         if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
1390                 bch2_btree_node_unlock_write(b, iter);
1391                 goto split;
1392         }
1393
1394         btree_node_interior_verify(c, b);
1395
1396         bch2_btree_insert_keys_interior(as, b, iter, keys);
1397
1398         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1399         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1400
1401         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1402                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1403         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1404                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1405
1406         if (u64s_added > live_u64s_added &&
1407             bch2_maybe_compact_whiteouts(c, b))
1408                 bch2_btree_iter_reinit_node(iter, b);
1409
1410         bch2_btree_node_unlock_write(b, iter);
1411
1412         btree_node_interior_verify(c, b);
1413
1414         /*
1415          * when called from the btree_split path the new nodes aren't added to
1416          * the btree iterator yet, so the merge path's unlock/wait/relock dance
1417          * won't work:
1418          */
1419         bch2_foreground_maybe_merge(c, iter, b->c.level,
1420                                     flags|BTREE_INSERT_NOUNLOCK);
1421         return;
1422 split:
1423         btree_split(as, b, iter, keys, flags);
1424 }
1425
1426 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1427                           unsigned flags)
1428 {
1429         struct btree_trans *trans = iter->trans;
1430         struct btree *b = iter_l(iter)->b;
1431         struct btree_update *as;
1432         struct closure cl;
1433         int ret = 0;
1434         struct btree_insert_entry *i;
1435
1436         /*
1437          * We already have a disk reservation and open buckets pinned; this
1438          * allocation must not block:
1439          */
1440         trans_for_each_update(trans, i)
1441                 if (btree_node_type_needs_gc(i->iter->btree_id))
1442                         flags |= BTREE_INSERT_USE_RESERVE;
1443
1444         closure_init_stack(&cl);
1445
1446         /* Hack, because gc and splitting nodes don't mix yet: */
1447         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1448             !down_read_trylock(&c->gc_lock)) {
1449                 if (flags & BTREE_INSERT_NOUNLOCK) {
1450                         trace_transaction_restart_ip(trans->ip, _THIS_IP_);
1451                         return -EINTR;
1452                 }
1453
1454                 bch2_trans_unlock(trans);
1455                 down_read(&c->gc_lock);
1456
1457                 if (!bch2_trans_relock(trans))
1458                         ret = -EINTR;
1459         }
1460
1461         /*
1462          * XXX: figure out how far we might need to split,
1463          * instead of locking/reserving all the way to the root:
1464          */
1465         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1466                 trace_trans_restart_iter_upgrade(trans->ip);
1467                 ret = -EINTR;
1468                 goto out;
1469         }
1470
1471         as = bch2_btree_update_start(trans, iter->btree_id,
1472                 btree_update_reserve_required(c, b), flags,
1473                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1474         if (IS_ERR(as)) {
1475                 ret = PTR_ERR(as);
1476                 if (ret == -EAGAIN) {
1477                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1478                         bch2_trans_unlock(trans);
1479                         ret = -EINTR;
1480
1481                         trace_transaction_restart_ip(trans->ip, _THIS_IP_);
1482                 }
1483                 goto out;
1484         }
1485
1486         btree_split(as, b, iter, NULL, flags);
1487         bch2_btree_update_done(as);
1488
1489         /*
1490          * We haven't successfully inserted yet, so don't downgrade all the way
1491          * back to read locks.
1492          */
1493         __bch2_btree_iter_downgrade(iter, 1);
1494 out:
1495         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1496                 up_read(&c->gc_lock);
1497         closure_sync(&cl);
1498         return ret;
1499 }
1500
1501 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1502                                    struct btree_iter *iter,
1503                                    unsigned level,
1504                                    unsigned flags,
1505                                    enum btree_node_sibling sib)
1506 {
1507         struct btree_trans *trans = iter->trans;
1508         struct btree_update *as;
1509         struct bkey_format_state new_s;
1510         struct bkey_format new_f;
1511         struct bkey_i delete;
1512         struct btree *b, *m, *n, *prev, *next, *parent;
1513         struct closure cl;
1514         size_t sib_u64s;
1515         int ret = 0;
1516
1517         BUG_ON(!btree_node_locked(iter, level));
1518
1519         closure_init_stack(&cl);
1520 retry:
1521         BUG_ON(!btree_node_locked(iter, level));
1522
1523         b = iter->l[level].b;
1524
1525         parent = btree_node_parent(iter, b);
1526         if (!parent)
1527                 goto out;
1528
1529         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1530                 goto out;
1531
1532         /* XXX: can't be holding read locks */
1533         m = bch2_btree_node_get_sibling(c, iter, b, sib);
1534         if (IS_ERR(m)) {
1535                 ret = PTR_ERR(m);
1536                 goto err;
1537         }
1538
1539         /* NULL means no sibling: */
1540         if (!m) {
1541                 b->sib_u64s[sib] = U16_MAX;
1542                 goto out;
1543         }
1544
1545         if (sib == btree_prev_sib) {
1546                 prev = m;
1547                 next = b;
1548         } else {
1549                 prev = b;
1550                 next = m;
1551         }
1552
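             /*
              * Estimate how big the merged node would be: build a key format
              * that can pack every key from both nodes, then count the u64s
              * the combined keys would take in that format.
              */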
1553         bch2_bkey_format_init(&new_s);
1554         __bch2_btree_calc_format(&new_s, b);
1555         __bch2_btree_calc_format(&new_s, m);
1556         new_f = bch2_bkey_format_done(&new_s);
1557
1558         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1559                 btree_node_u64s_with_format(m, &new_f);
1560
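             /*
              * Damp the estimate: only count half of whatever is above the
              * hysteresis threshold, so a merge isn't immediately followed by
              * another split.
              */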
1561         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1562                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1563                 sib_u64s /= 2;
1564                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1565         }
1566
1567         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1568         b->sib_u64s[sib] = sib_u64s;
1569
1570         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1571                 six_unlock_intent(&m->c.lock);
1572                 goto out;
1573         }
1574
1575         /* We're changing btree topology; this doesn't mix with gc: */
1576         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1577             !down_read_trylock(&c->gc_lock))
1578                 goto err_cycle_gc_lock;
1579
1580         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1581                 ret = -EINTR;
1582                 goto err_unlock;
1583         }
1584
1585         as = bch2_btree_update_start(trans, iter->btree_id,
1586                          btree_update_reserve_required(c, parent) + 1,
1587                          flags|
1588                          BTREE_INSERT_NOFAIL|
1589                          BTREE_INSERT_USE_RESERVE,
1590                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1591         if (IS_ERR(as)) {
1592                 ret = PTR_ERR(as);
1593                 goto err_unlock;
1594         }
1595
1596         trace_btree_merge(c, b);
1597
1598         bch2_btree_interior_update_will_free_node(as, b);
1599         bch2_btree_interior_update_will_free_node(as, m);
1600
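             /*
              * Build the merged node @n spanning @prev and @next; the parent
              * then gets a deletion at @prev's position plus @n's key, which
              * replaces @next's entry.
              */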
1601         n = bch2_btree_node_alloc(as, b->c.level);
1602         bch2_btree_update_add_new_node(as, n);
1603
1604         btree_set_min(n, prev->data->min_key);
1605         btree_set_max(n, next->data->max_key);
1606         n->data->format         = new_f;
1607
1608         btree_node_set_format(n, new_f);
1609
1610         bch2_btree_sort_into(c, n, prev);
1611         bch2_btree_sort_into(c, n, next);
1612
1613         bch2_btree_build_aux_trees(n);
1614         six_unlock_write(&n->c.lock);
1615
1616         bkey_init(&delete.k);
1617         delete.k.p = prev->key.k.p;
1618         bch2_keylist_add(&as->parent_keys, &delete);
1619         bch2_keylist_add(&as->parent_keys, &n->key);
1620
1621         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1622
1623         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1624
1625         bch2_btree_update_get_open_buckets(as, n);
1626
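             /*
              * Take an extra intent lock reference on @b, drop both old nodes
              * from the iterator, point it at the new node @n, then free the
              * old nodes.
              */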
1627         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1628         bch2_btree_iter_node_drop(iter, b);
1629         bch2_btree_iter_node_drop(iter, m);
1630
1631         bch2_btree_iter_node_replace(iter, n);
1632
1633         bch2_btree_trans_verify_iters(trans, n);
1634
1635         bch2_btree_node_free_inmem(c, b, iter);
1636         bch2_btree_node_free_inmem(c, m, iter);
1637
1638         six_unlock_intent(&n->c.lock);
1639
1640         bch2_btree_update_done(as);
1641
1642         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1643                 up_read(&c->gc_lock);
1644 out:
1645         bch2_btree_trans_verify_locks(trans);
1646
1647         /*
1648          * Don't downgrade locks here: we're called after a successful insert,
1649          * and the caller will downgrade locks anyway (in case e.g. a split was
1650          * required first).
1651          *
1652          * We're also called when inserting into interior nodes from the split
1653          * path, and downgrading to read locks there is potentially confusing:
1654          */
1655          */
1656         closure_sync(&cl);
1657         return;
1658
1659 err_cycle_gc_lock:
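             /*
              * Couldn't take gc_lock without blocking: unless the caller forbids
              * unlocking, drop the transaction locks and cycle gc_lock (take and
              * release it) to wait for gc, then retry from the top.
              */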
1660         six_unlock_intent(&m->c.lock);
1661
1662         if (flags & BTREE_INSERT_NOUNLOCK)
1663                 goto out;
1664
1665         bch2_trans_unlock(trans);
1666
1667         down_read(&c->gc_lock);
1668         up_read(&c->gc_lock);
1669         ret = -EINTR;
1670         goto err;
1671
1672 err_unlock:
1673         six_unlock_intent(&m->c.lock);
1674         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1675                 up_read(&c->gc_lock);
1676 err:
1677         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1678
1679         if ((ret == -EAGAIN || ret == -EINTR) &&
1680             !(flags & BTREE_INSERT_NOUNLOCK)) {
1681                 bch2_trans_unlock(trans);
1682                 closure_sync(&cl);
1683                 ret = bch2_btree_iter_traverse(iter);
1684                 if (ret)
1685                         goto out;
1686
1687                 goto retry;
1688         }
1689
1690         goto out;
1691 }
1692
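     /*
      * Allocate a replacement for @b with the same contents, write it out, and
      * swap it into the tree in @b's place, via the parent node if there is
      * one, otherwise by making it the new root.
      */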
1693 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1694                                 struct btree *b, unsigned flags,
1695                                 struct closure *cl)
1696 {
1697         struct btree *n, *parent = btree_node_parent(iter, b);
1698         struct btree_update *as;
1699
1700         as = bch2_btree_update_start(iter->trans, iter->btree_id,
1701                 (parent
1702                  ? btree_update_reserve_required(c, parent)
1703                  : 0) + 1,
1704                 flags, cl);
1705         if (IS_ERR(as)) {
1706                 trace_btree_gc_rewrite_node_fail(c, b);
1707                 return PTR_ERR(as);
1708         }
1709
1710         bch2_btree_interior_update_will_free_node(as, b);
1711
1712         n = bch2_btree_node_alloc_replacement(as, b);
1713         bch2_btree_update_add_new_node(as, n);
1714
1715         bch2_btree_build_aux_trees(n);
1716         six_unlock_write(&n->c.lock);
1717
1718         trace_btree_gc_rewrite_node(c, b);
1719
1720         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1721
1722         if (parent) {
1723                 bch2_keylist_add(&as->parent_keys, &n->key);
1724                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1725         } else {
1726                 bch2_btree_set_root(as, n, iter);
1727         }
1728
1729         bch2_btree_update_get_open_buckets(as, n);
1730
1731         six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1732         bch2_btree_iter_node_drop(iter, b);
1733         bch2_btree_iter_node_replace(iter, n);
1734         bch2_btree_node_free_inmem(c, b, iter);
1735         six_unlock_intent(&n->c.lock);
1736
1737         bch2_btree_update_done(as);
1738         return 0;
1739 }
1740
1741 /**
1742  * bch2_btree_node_rewrite - Rewrite/move a btree node
1743  *
1744  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. when
1745  * btree_check_reserve() has to wait)
1746  */
1747 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1748                             __le64 seq, unsigned flags)
1749 {
1750         struct btree_trans *trans = iter->trans;
1751         struct closure cl;
1752         struct btree *b;
1753         int ret;
1754
1755         flags |= BTREE_INSERT_NOFAIL;
1756
1757         closure_init_stack(&cl);
1758
1759         bch2_btree_iter_upgrade(iter, U8_MAX);
1760
1761         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1762                 if (!down_read_trylock(&c->gc_lock)) {
1763                         bch2_trans_unlock(trans);
1764                         down_read(&c->gc_lock);
1765                 }
1766         }
1767
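             /*
              * Retry loop: re-traverse the iterator, check that the node we
              * were asked to rewrite still exists (its sequence number must
              * match @seq), and retry if the rewrite couldn't get its
              * reservation without blocking.
              */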
1768         while (1) {
1769                 ret = bch2_btree_iter_traverse(iter);
1770                 if (ret)
1771                         break;
1772
1773                 b = bch2_btree_iter_peek_node(iter);
1774                 if (!b || b->data->keys.seq != seq)
1775                         break;
1776
1777                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1778                 if (ret != -EAGAIN &&
1779                     ret != -EINTR)
1780                         break;
1781
1782                 bch2_trans_unlock(trans);
1783                 closure_sync(&cl);
1784         }
1785
1786         bch2_btree_iter_downgrade(iter);
1787
1788         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1789                 up_read(&c->gc_lock);
1790
1791         closure_sync(&cl);
1792         return ret;
1793 }
1794
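     /*
      * Replace the key that points to @b (in its parent node, or the root key
      * if @b is a root) with @new_key; @new_hash, if non-NULL, is a spare node
      * used to keep the btree node hash table consistent while rehashing @b.
      */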
1795 static void __bch2_btree_node_update_key(struct bch_fs *c,
1796                                          struct btree_update *as,
1797                                          struct btree_iter *iter,
1798                                          struct btree *b, struct btree *new_hash,
1799                                          struct bkey_i *new_key)
1800 {
1801         struct btree *parent;
1802         int ret;
1803
1804         btree_update_will_delete_key(as, &b->key);
1805         btree_update_will_add_key(as, new_key);
1806
1807         parent = btree_node_parent(iter, b);
1808         if (parent) {
1809                 if (new_hash) {
1810                         bkey_copy(&new_hash->key, new_key);
1811                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
1812                                         new_hash, b->c.level, b->c.btree_id);
1813                         BUG_ON(ret);
1814                 }
1815
1816                 bch2_keylist_add(&as->parent_keys, new_key);
1817                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
1818
1819                 if (new_hash) {
1820                         mutex_lock(&c->btree_cache.lock);
1821                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
1822
1823                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1824
1825                         bkey_copy(&b->key, new_key);
1826                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1827                         BUG_ON(ret);
1828                         mutex_unlock(&c->btree_cache.lock);
1829                 } else {
1830                         bkey_copy(&b->key, new_key);
1831                 }
1832         } else {
1833                 BUG_ON(btree_node_root(c, b) != b);
1834
1835                 bch2_btree_node_lock_write(b, iter);
1836                 bkey_copy(&b->key, new_key);
1837
1838                 if (btree_ptr_hash_val(&b->key) != b->hash_val) {
1839                         mutex_lock(&c->btree_cache.lock);
1840                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1841
1842                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1843                         BUG_ON(ret);
1844                         mutex_unlock(&c->btree_cache.lock);
1845                 }
1846
1847                 btree_update_updated_root(as, b);
1848                 bch2_btree_node_unlock_write(b, iter);
1849         }
1850
1851         bch2_btree_update_done(as);
1852 }
1853
1854 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
1855                                struct btree *b,
1856                                struct bkey_i *new_key)
1857 {
1858         struct btree *parent = btree_node_parent(iter, b);
1859         struct btree_update *as = NULL;
1860         struct btree *new_hash = NULL;
1861         struct closure cl;
1862         int ret;
1863
1864         closure_init_stack(&cl);
1865
1866         if (!bch2_btree_iter_upgrade(iter, U8_MAX))
1867                 return -EINTR;
1868
1869         if (!down_read_trylock(&c->gc_lock)) {
1870                 bch2_trans_unlock(iter->trans);
1871                 down_read(&c->gc_lock);
1872
1873                 if (!bch2_trans_relock(iter->trans)) {
1874                         ret = -EINTR;
1875                         goto err;
1876                 }
1877         }
1878
1879         /*
1880          * check btree_ptr_hash_val() after @b is locked by
1881          * btree_iter_traverse():
1882          */
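             /*
              * If the new key hashes differently than the current one, allocate
              * a spare node (@new_hash) to temporarily occupy the new hash slot
              * while the parent update is in flight.
              */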
1883         if (btree_ptr_hash_val(new_key) != b->hash_val) {
1884                 /* bch2_btree_reserve_get will unlock */
1885                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1886                 if (ret) {
1887                         bch2_trans_unlock(iter->trans);
1888                         up_read(&c->gc_lock);
1889                         closure_sync(&cl);
1890                         down_read(&c->gc_lock);
1891
1892                         if (!bch2_trans_relock(iter->trans)) {
1893                                 ret = -EINTR;
1894                                 goto err;
1895                         }
1896                 }
1897
1898                 new_hash = bch2_btree_node_mem_alloc(c);
1899         }
1900 retry:
1901         as = bch2_btree_update_start(iter->trans, iter->btree_id,
1902                 parent ? btree_update_reserve_required(c, parent) : 0,
1903                 BTREE_INSERT_NOFAIL|
1904                 BTREE_INSERT_USE_RESERVE|
1905                 BTREE_INSERT_USE_ALLOC_RESERVE,
1906                 &cl);
1907
1908         if (IS_ERR(as)) {
1909                 ret = PTR_ERR(as);
1910                 if (ret == -EAGAIN)
1911                         ret = -EINTR;
1912
1913                 if (ret == -EINTR) {
1914                         bch2_trans_unlock(iter->trans);
1915                         up_read(&c->gc_lock);
1916                         closure_sync(&cl);
1917                         down_read(&c->gc_lock);
1918
1919                         if (bch2_trans_relock(iter->trans))
1920                                 goto retry;
1921                 }
1922
1923                 goto err;
1924         }
1925
1926         ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
1927         if (ret)
1928                 goto err_free_update;
1929
1930         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
1931
1932         bch2_btree_iter_downgrade(iter);
1933 err:
1934         if (new_hash) {
1935                 mutex_lock(&c->btree_cache.lock);
1936                 list_move(&new_hash->list, &c->btree_cache.freeable);
1937                 mutex_unlock(&c->btree_cache.lock);
1938
1939                 six_unlock_write(&new_hash->c.lock);
1940                 six_unlock_intent(&new_hash->c.lock);
1941         }
1942         up_read(&c->gc_lock);
1943         closure_sync(&cl);
1944         return ret;
1945 err_free_update:
1946         bch2_btree_update_free(as);
1947         goto err;
1948 }
1949
1950 /* Init code: */
1951
1952 /*
1953  * Only for filesystem bringup: when first reading the btree roots, or when
1954  * allocating btree roots while initializing a new filesystem:
1955  */
1956 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
1957 {
1958         BUG_ON(btree_node_root(c, b));
1959
1960         bch2_btree_set_root_inmem(c, b);
1961 }
1962
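     /*
      * Allocate an empty in-memory root for btree @id; the node is marked fake
      * and flagged need_rewrite, since its key doesn't point at anything on
      * disk yet.
      */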
1963 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
1964 {
1965         struct closure cl;
1966         struct btree *b;
1967         int ret;
1968
1969         closure_init_stack(&cl);
1970
1971         do {
1972                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1973                 closure_sync(&cl);
1974         } while (ret);
1975
1976         b = bch2_btree_node_mem_alloc(c);
1977         bch2_btree_cache_cannibalize_unlock(c);
1978
1979         set_btree_node_fake(b);
1980         set_btree_node_need_rewrite(b);
1981         b->c.level      = 0;
1982         b->c.btree_id   = id;
1983
1984         bkey_btree_ptr_init(&b->key);
1985         b->key.k.p = POS_MAX;
1986         *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
1987
1988         bch2_bset_init_first(b, &b->data->keys);
1989         bch2_btree_build_aux_trees(b);
1990
1991         b->data->flags = 0;
1992         btree_set_min(b, POS_MIN);
1993         btree_set_max(b, POS_MAX);
1994         b->data->format = bch2_btree_calc_format(b);
1995         btree_node_set_format(b, b->data->format);
1996
1997         ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
1998                                           b->c.level, b->c.btree_id);
1999         BUG_ON(ret);
2000
2001         bch2_btree_set_root_inmem(c, b);
2002
2003         six_unlock_write(&b->c.lock);
2004         six_unlock_intent(&b->c.lock);
2005 }
2006
2007 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
2008 {
2009         struct btree_update *as;
2010
2011         mutex_lock(&c->btree_interior_update_lock);
2012         list_for_each_entry(as, &c->btree_interior_update_list, list)
2013                 pr_buf(out, "%p m %u w %u r %u j %llu\n",
2014                        as,
2015                        as->mode,
2016                        as->nodes_written,
2017                        atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
2018                        as->journal.seq);
2019         mutex_unlock(&c->btree_interior_update_lock);
2020 }
2021
2022 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
2023 {
2024         size_t ret = 0;
2025         struct list_head *i;
2026
2027         mutex_lock(&c->btree_interior_update_lock);
2028         list_for_each(i, &c->btree_interior_update_list)
2029                 ret++;
2030         mutex_unlock(&c->btree_interior_update_lock);
2031
2032         return ret;
2033 }
2034
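     /*
      * Scan a journal entry set for btree root entries and copy each one into
      * c->btree_roots (level and key), marking those roots alive.
      */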
2035 void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
2036 {
2037         struct btree_root *r;
2038         struct jset_entry *entry;
2039
2040         mutex_lock(&c->btree_root_lock);
2041
2042         vstruct_for_each(jset, entry)
2043                 if (entry->type == BCH_JSET_ENTRY_btree_root) {
2044                         r = &c->btree_roots[entry->btree_id];
2045                         r->level = entry->level;
2046                         r->alive = true;
2047                         bkey_copy(&r->key, &entry->start[0]);
2048                 }
2049
2050         mutex_unlock(&c->btree_root_lock);
2051 }
2052
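     /*
      * Append a btree root journal entry for every live btree root that doesn't
      * already have one in [@start, @end); returns the new end of the entry
      * list.
      */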
2053 struct jset_entry *
2054 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
2055                                     struct jset_entry *start,
2056                                     struct jset_entry *end)
2057 {
2058         struct jset_entry *entry;
2059         unsigned long have = 0;
2060         unsigned i;
2061
2062         for (entry = start; entry < end; entry = vstruct_next(entry))
2063                 if (entry->type == BCH_JSET_ENTRY_btree_root)
2064                         __set_bit(entry->btree_id, &have);
2065
2066         mutex_lock(&c->btree_root_lock);
2067
2068         for (i = 0; i < BTREE_ID_NR; i++)
2069                 if (c->btree_roots[i].alive && !test_bit(i, &have)) {
2070                         journal_entry_set(end,
2071                                           BCH_JSET_ENTRY_btree_root,
2072                                           i, c->btree_roots[i].level,
2073                                           &c->btree_roots[i].key,
2074                                           c->btree_roots[i].key.u64s);
2075                         end = vstruct_next(end);
2076                 }
2077
2078         mutex_unlock(&c->btree_root_lock);
2079
2080         return end;
2081 }
2082
2083 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
2084 {
2085         if (c->btree_interior_update_worker)
2086                 destroy_workqueue(c->btree_interior_update_worker);
2087         mempool_exit(&c->btree_interior_update_pool);
2088 }
2089
2090 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
2091 {
2092         mutex_init(&c->btree_reserve_cache_lock);
2093         INIT_LIST_HEAD(&c->btree_interior_update_list);
2094         INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
2095         mutex_init(&c->btree_interior_update_lock);
2096         INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
2097
2098         c->btree_interior_update_worker =
2099                 alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
2100         if (!c->btree_interior_update_worker)
2101                 return -ENOMEM;
2102
2103         return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
2104                                          sizeof(struct btree_update));
2105 }