1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_methods.h"
6 #include "btree_cache.h"
7 #include "btree_gc.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "btree_io.h"
11 #include "btree_iter.h"
12 #include "btree_locking.h"
13 #include "buckets.h"
14 #include "extents.h"
15 #include "journal.h"
16 #include "journal_reclaim.h"
17 #include "keylist.h"
18 #include "replicas.h"
19 #include "super-io.h"
20
21 #include <linux/random.h>
22 #include <trace/events/bcachefs.h>
23
24 static void btree_node_will_make_reachable(struct btree_update *,
25                                            struct btree *);
26 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
27 static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
28
29 /* Debug code: */
30
31 static void btree_node_interior_verify(struct btree *b)
32 {
33         struct btree_node_iter iter;
34         struct bkey_packed *k;
35
36         BUG_ON(!b->level);
37
38         bch2_btree_node_iter_init(&iter, b, &b->key.k.p);
39 #if 1
40         BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
41                bkey_cmp_left_packed(b, k, &b->key.k.p));
42
43         BUG_ON((bch2_btree_node_iter_advance(&iter, b),
44                 !bch2_btree_node_iter_end(&iter)));
45 #else
46         const char *msg;
47
48         msg = "not found";
49         k = bch2_btree_node_iter_peek(&iter, b);
50         if (!k)
51                 goto err;
52
53         msg = "isn't what it should be";
54         if (bkey_cmp_left_packed(b, k, &b->key.k.p))
55                 goto err;
56
57         bch2_btree_node_iter_advance(&iter, b);
58
59         msg = "isn't last key";
60         if (!bch2_btree_node_iter_end(&iter))
61                 goto err;
62         return;
63 err:
64         bch2_dump_btree_node(b);
65         printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
66                b->key.k.p.offset, msg);
67         BUG();
68 #endif
69 }
70
71 /* Calculate ideal packed bkey format for new btree nodes: */
72
73 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
74 {
75         struct bkey_packed *k;
76         struct bset_tree *t;
77         struct bkey uk;
78
79         bch2_bkey_format_add_pos(s, b->data->min_key);
80
81         for_each_bset(b, t)
82                 bset_tree_for_each_key(b, t, k)
83                         if (!bkey_whiteout(k)) {
84                                 uk = bkey_unpack_key(b, k);
85                                 bch2_bkey_format_add_key(s, &uk);
86                         }
87 }
88
89 static struct bkey_format bch2_btree_calc_format(struct btree *b)
90 {
91         struct bkey_format_state s;
92
93         bch2_bkey_format_init(&s);
94         __bch2_btree_calc_format(&s, b);
95
96         return bch2_bkey_format_done(&s);
97 }
98
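/*
 * Return the number of u64s of live keys @b would contain if repacked with
 * @new_f - assumes every key, including currently unpacked ones, can be packed
 * with the new format:
 */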
99 static size_t btree_node_u64s_with_format(struct btree *b,
100                                           struct bkey_format *new_f)
101 {
102         struct bkey_format *old_f = &b->format;
103
104         /* stupid integer promotion rules */
105         ssize_t delta =
106             (((int) new_f->key_u64s - old_f->key_u64s) *
107              (int) b->nr.packed_keys) +
108             (((int) new_f->key_u64s - BKEY_U64s) *
109              (int) b->nr.unpacked_keys);
110
111         BUG_ON(delta + b->nr.live_u64s < 0);
112
113         return b->nr.live_u64s + delta;
114 }
115
116 /**
117  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
118  *
119  * This assumes all keys can pack with the new format -- it just checks if
120  * the re-packed keys would fit inside the node itself.
121  */
122 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
123                                  struct bkey_format *new_f)
124 {
125         size_t u64s = btree_node_u64s_with_format(b, new_f);
126
127         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
128 }
129
130 /* Btree node freeing/allocation: */
131
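/*
 * Two keys refer to the same on-disk btree node if they share at least one
 * pointer (same device, generation and offset):
 */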
132 static bool btree_key_matches(struct bch_fs *c,
133                               struct bkey_s_c l,
134                               struct bkey_s_c r)
135 {
136         struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
137         struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
138         const struct bch_extent_ptr *ptr1, *ptr2;
139
140         bkey_for_each_ptr(ptrs1, ptr1)
141                 bkey_for_each_ptr(ptrs2, ptr2)
142                         if (ptr1->dev == ptr2->dev &&
143                             ptr1->gen == ptr2->gen &&
144                             ptr1->offset == ptr2->offset)
145                                 return true;
146
147         return false;
148 }
149
150 /*
151  * We're doing the index update that makes @b unreachable; update stuff to
152  * reflect that:
153  *
154  * Must be called _before_ btree_update_updated_root() or
155  * btree_update_updated_node:
156  */
157 static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
158                                        struct bkey_s_c k,
159                                        struct bch_fs_usage *stats)
160 {
161         struct bch_fs *c = as->c;
162         struct pending_btree_node_free *d;
163
164         for (d = as->pending; d < as->pending + as->nr_pending; d++)
165                 if (!bkey_cmp(k.k->p, d->key.k.p) &&
166                     btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
167                         goto found;
168         BUG();
169 found:
170         BUG_ON(d->index_update_done);
171         d->index_update_done = true;
172
173         /*
174          * We're dropping @k from the btree, but it's still live until the
175          * index update is persistent so we need to keep a reference around for
176          * mark and sweep to find - that's primarily what the
177          * btree_node_pending_free list is for.
178          *
179          * So here (when we set index_update_done = true), we're moving an
180          * existing reference to a different part of the larger "gc keyspace" -
181          * and the new position comes after the old position, since GC marks
182          * the pending free list after it walks the btree.
183          *
184          * If we move the reference while mark and sweep is _between_ the old
185          * and the new position, mark and sweep will see the reference twice
186          * and it'll get double accounted - so check for that here and subtract
187          * to cancel out one of mark and sweep's markings if necessary:
188          */
189
190         if (gc_pos_cmp(c->gc_pos, b
191                        ? gc_pos_btree_node(b)
192                        : gc_pos_btree_root(as->btree_id)) >= 0 &&
193             gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
194                 bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
195                               0, 0, NULL, 0,
196                               BTREE_TRIGGER_OVERWRITE|
197                               BTREE_TRIGGER_GC);
198 }
199
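/*
 * Drop @b from the in-memory btree: remove it from the btree node hash table
 * and put it on the freeable list. The caller holds the node's write lock and
 * has already released the node's open buckets.
 */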
200 static void __btree_node_free(struct bch_fs *c, struct btree *b)
201 {
202         trace_btree_node_free(c, b);
203
204         BUG_ON(btree_node_dirty(b));
205         BUG_ON(btree_node_need_write(b));
206         BUG_ON(b == btree_node_root(c, b));
207         BUG_ON(b->ob.nr);
208         BUG_ON(!list_empty(&b->write_blocked));
209         BUG_ON(b->will_make_reachable);
210
211         clear_btree_node_noevict(b);
212
213         bch2_btree_node_hash_remove(&c->btree_cache, b);
214
215         mutex_lock(&c->btree_cache.lock);
216         list_move(&b->list, &c->btree_cache.freeable);
217         mutex_unlock(&c->btree_cache.lock);
218 }
219
220 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
221 {
222         struct open_buckets ob = b->ob;
223
224         btree_update_drop_new_node(c, b);
225
226         b->ob.nr = 0;
227
228         clear_btree_node_dirty(b);
229
230         btree_node_lock_type(c, b, SIX_LOCK_write);
231         __btree_node_free(c, b);
232         six_unlock_write(&b->lock);
233
234         bch2_open_buckets_put(c, &ob);
235 }
236
237 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
238                                 struct btree_iter *iter)
239 {
240         struct btree_iter *linked;
241
242         trans_for_each_iter(iter->trans, linked)
243                 BUG_ON(linked->l[b->level].b == b);
244
245         /*
246          * Is this a node that isn't reachable on disk yet?
247          *
248          * Nodes that aren't reachable yet have writes blocked until they're
249          * reachable - now that we've cancelled any pending writes and moved
250          * things waiting on that write to wait on this update, we can drop this
251          * node from the list of nodes that the other update is making
252          * reachable, prior to freeing it:
253          */
254         btree_update_drop_new_node(c, b);
255
256         six_lock_write(&b->lock);
257         __btree_node_free(c, b);
258         six_unlock_write(&b->lock);
259         six_unlock_intent(&b->lock);
260 }
261
262 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
263                                         struct pending_btree_node_free *pending)
264 {
265         BUG_ON(!pending->index_update_done);
266
267         bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
268                       0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
269
270         if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
271                 bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
272                               0, 0, NULL, 0,
273                               BTREE_TRIGGER_OVERWRITE|
274                               BTREE_TRIGGER_GC);
275 }
276
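/*
 * Allocate on-disk space for a new btree node - from the btree reserve cache
 * if possible, otherwise from a write point - and pair it with an in-memory
 * struct btree:
 */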
277 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
278                                              struct disk_reservation *res,
279                                              struct closure *cl,
280                                              unsigned flags)
281 {
282         struct write_point *wp;
283         struct btree *b;
284         BKEY_PADDED(k) tmp;
285         struct open_buckets ob = { .nr = 0 };
286         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
287         unsigned nr_reserve;
288         enum alloc_reserve alloc_reserve;
289
290         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
291                 nr_reserve      = 0;
292                 alloc_reserve   = RESERVE_ALLOC;
293         } else if (flags & BTREE_INSERT_USE_RESERVE) {
294                 nr_reserve      = BTREE_NODE_RESERVE / 2;
295                 alloc_reserve   = RESERVE_BTREE;
296         } else {
297                 nr_reserve      = BTREE_NODE_RESERVE;
298                 alloc_reserve   = RESERVE_NONE;
299         }
300
301         mutex_lock(&c->btree_reserve_cache_lock);
302         if (c->btree_reserve_cache_nr > nr_reserve) {
303                 struct btree_alloc *a =
304                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
305
306                 ob = a->ob;
307                 bkey_copy(&tmp.k, &a->k);
308                 mutex_unlock(&c->btree_reserve_cache_lock);
309                 goto mem_alloc;
310         }
311         mutex_unlock(&c->btree_reserve_cache_lock);
312
313 retry:
314         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
315                                       writepoint_ptr(&c->btree_write_point),
316                                       &devs_have,
317                                       res->nr_replicas,
318                                       c->opts.metadata_replicas_required,
319                                       alloc_reserve, 0, cl);
320         if (IS_ERR(wp))
321                 return ERR_CAST(wp);
322
323         if (wp->sectors_free < c->opts.btree_node_size) {
324                 struct open_bucket *ob;
325                 unsigned i;
326
327                 open_bucket_for_each(c, &wp->ptrs, ob, i)
328                         if (ob->sectors_free < c->opts.btree_node_size)
329                                 ob->sectors_free = 0;
330
331                 bch2_alloc_sectors_done(c, wp);
332                 goto retry;
333         }
334
335         if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
336                 bkey_btree_ptr_v2_init(&tmp.k);
337         else
338                 bkey_btree_ptr_init(&tmp.k);
339
340         bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
341
342         bch2_open_bucket_get(c, wp, &ob);
343         bch2_alloc_sectors_done(c, wp);
344 mem_alloc:
345         b = bch2_btree_node_mem_alloc(c);
346
347         /* we hold cannibalize_lock: */
348         BUG_ON(IS_ERR(b));
349         BUG_ON(b->ob.nr);
350
351         bkey_copy(&b->key, &tmp.k);
352         b->ob = ob;
353
354         return b;
355 }
356
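/*
 * Take a preallocated node from the update's reserve and initialize it as an
 * empty node at @level:
 */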
357 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
358 {
359         struct bch_fs *c = as->c;
360         struct btree *b;
361         int ret;
362
363         BUG_ON(level >= BTREE_MAX_DEPTH);
364         BUG_ON(!as->reserve->nr);
365
366         b = as->reserve->b[--as->reserve->nr];
367
368         set_btree_node_accessed(b);
369         set_btree_node_dirty(b);
370         set_btree_node_need_write(b);
371
372         bch2_bset_init_first(b, &b->data->keys);
373         b->level        = level;
374         b->btree_id     = as->btree_id;
375
376         memset(&b->nr, 0, sizeof(b->nr));
377         b->data->magic = cpu_to_le64(bset_magic(c));
378         b->data->flags = 0;
379         SET_BTREE_NODE_ID(b->data, as->btree_id);
380         SET_BTREE_NODE_LEVEL(b->data, level);
381         b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
382
383         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
384                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
385
386                 bp->v.mem_ptr           = 0;
387                 bp->v.seq               = b->data->keys.seq;
388                 bp->v.sectors_written   = 0;
389                 bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
390         }
391
392         if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
393                 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
394
395         if (btree_node_is_extents(b) &&
396             !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
397                 set_btree_node_old_extent_overwrite(b);
398
399         bch2_btree_build_aux_trees(b);
400
401         btree_node_will_make_reachable(as, b);
402
403         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
404         BUG_ON(ret);
405
406         trace_btree_node_alloc(c, b);
407         return b;
408 }
409
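/*
 * Helpers that keep a node's key and its on-disk header in sync when setting
 * the range of keys the node covers:
 */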
410 static void btree_set_min(struct btree *b, struct bpos pos)
411 {
412         if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
413                 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
414         b->data->min_key = pos;
415 }
416
417 static void btree_set_max(struct btree *b, struct bpos pos)
418 {
419         b->key.k.p = pos;
420         b->data->max_key = pos;
421 }
422
423 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
424                                                   struct btree *b,
425                                                   struct bkey_format format)
426 {
427         struct btree *n;
428
429         n = bch2_btree_node_alloc(as, b->level);
430
431         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
432
433         btree_set_min(n, b->data->min_key);
434         btree_set_max(n, b->data->max_key);
435
436         n->data->format         = format;
437         btree_node_set_format(n, format);
438
439         bch2_btree_sort_into(as->c, n, b);
440
441         btree_node_reset_sib_u64s(n);
442
443         n->key.k.p = b->key.k.p;
444         return n;
445 }
446
447 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
448                                                        struct btree *b)
449 {
450         struct bkey_format new_f = bch2_btree_calc_format(b);
451
452         /*
453          * The keys might expand with the new format - if they wouldn't fit in
454          * the btree node anymore, use the old format for now:
455          */
456         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
457                 new_f = b->format;
458
459         return __bch2_btree_node_alloc_replacement(as, b, new_f);
460 }
461
462 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
463 {
464         struct btree *b = bch2_btree_node_alloc(as, level);
465
466         btree_set_min(b, POS_MIN);
467         btree_set_max(b, POS_MAX);
468         b->data->format = bch2_btree_calc_format(b);
469
470         btree_node_set_format(b, b->data->format);
471         bch2_btree_build_aux_trees(b);
472
473         six_unlock_write(&b->lock);
474
475         return b;
476 }
477
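/*
 * Release a reserve of preallocated nodes: stash their allocations in the
 * filesystem-wide btree_reserve_cache when there's room, free the in-memory
 * nodes and drop the disk reservation:
 */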
478 static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
479 {
480         bch2_disk_reservation_put(c, &reserve->disk_res);
481
482         mutex_lock(&c->btree_reserve_cache_lock);
483
484         while (reserve->nr) {
485                 struct btree *b = reserve->b[--reserve->nr];
486
487                 six_unlock_write(&b->lock);
488
489                 if (c->btree_reserve_cache_nr <
490                     ARRAY_SIZE(c->btree_reserve_cache)) {
491                         struct btree_alloc *a =
492                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
493
494                         a->ob = b->ob;
495                         b->ob.nr = 0;
496                         bkey_copy(&a->k, &b->key);
497                 } else {
498                         bch2_open_buckets_put(c, &b->ob);
499                 }
500
501                 btree_node_lock_type(c, b, SIX_LOCK_write);
502                 __btree_node_free(c, b);
503                 six_unlock_write(&b->lock);
504
505                 six_unlock_intent(&b->lock);
506         }
507
508         mutex_unlock(&c->btree_reserve_cache_lock);
509
510         mempool_free(reserve, &c->btree_reserve_pool);
511 }
512
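/*
 * Get a disk reservation and preallocate @nr_nodes btree nodes for an interior
 * update; takes the btree cache cannibalize lock to guarantee forward
 * progress:
 */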
513 static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
514                                                     unsigned nr_nodes,
515                                                     unsigned flags,
516                                                     struct closure *cl)
517 {
518         struct btree_reserve *reserve;
519         struct btree *b;
520         struct disk_reservation disk_res = { 0, 0 };
521         unsigned sectors = nr_nodes * c->opts.btree_node_size;
522         int ret, disk_res_flags = 0;
523
524         if (flags & BTREE_INSERT_NOFAIL)
525                 disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
526
527         /*
528          * This check isn't necessary for correctness - it's just to potentially
529          * prevent us from doing a lot of work that'll end up being wasted:
530          */
531         ret = bch2_journal_error(&c->journal);
532         if (ret)
533                 return ERR_PTR(ret);
534
535         if (bch2_disk_reservation_get(c, &disk_res, sectors,
536                                       c->opts.metadata_replicas,
537                                       disk_res_flags))
538                 return ERR_PTR(-ENOSPC);
539
540         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
541
542         /*
543          * Protects reaping from the btree node cache and using the btree node
544          * open bucket reserve:
545          */
546         ret = bch2_btree_cache_cannibalize_lock(c, cl);
547         if (ret) {
548                 bch2_disk_reservation_put(c, &disk_res);
549                 return ERR_PTR(ret);
550         }
551
552         reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
553
554         reserve->disk_res = disk_res;
555         reserve->nr = 0;
556
557         while (reserve->nr < nr_nodes) {
558                 b = __bch2_btree_node_alloc(c, &disk_res,
559                                             flags & BTREE_INSERT_NOWAIT
560                                             ? NULL : cl, flags);
561                 if (IS_ERR(b)) {
562                         ret = PTR_ERR(b);
563                         goto err_free;
564                 }
565
566                 ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
567                 if (ret)
568                         goto err_free;
569
570                 reserve->b[reserve->nr++] = b;
571         }
572
573         bch2_btree_cache_cannibalize_unlock(c);
574         return reserve;
575 err_free:
576         bch2_btree_reserve_put(c, reserve);
577         bch2_btree_cache_cannibalize_unlock(c);
578         trace_btree_reserve_get_fail(c, nr_nodes, cl);
579         return ERR_PTR(ret);
580 }
581
582 /* Asynchronous interior node update machinery */
583
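/*
 * Final teardown of a btree_update, after all of its new nodes are reachable
 * and all of its pending node frees have been processed:
 */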
584 static void bch2_btree_update_free(struct btree_update *as)
585 {
586         struct bch_fs *c = as->c;
587
588         bch2_journal_pin_flush(&c->journal, &as->journal);
589
590         BUG_ON(as->nr_new_nodes);
591         BUG_ON(as->nr_pending);
592
593         if (as->reserve)
594                 bch2_btree_reserve_put(c, as->reserve);
595
596         mutex_lock(&c->btree_interior_update_lock);
597         list_del(&as->list);
598
599         closure_debug_destroy(&as->cl);
600         mempool_free(as, &c->btree_interior_update_pool);
601
602         closure_wake_up(&c->btree_interior_update_wait);
603         mutex_unlock(&c->btree_interior_update_lock);
604 }
605
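/*
 * The index update making our new nodes visible is now persistent: write out
 * any new nodes that still need it and free the old nodes on disk:
 */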
606 static void btree_update_nodes_reachable(struct closure *cl)
607 {
608         struct btree_update *as = container_of(cl, struct btree_update, cl);
609         struct bch_fs *c = as->c;
610
611         bch2_journal_pin_drop(&c->journal, &as->journal);
612
613         mutex_lock(&c->btree_interior_update_lock);
614
615         while (as->nr_new_nodes) {
616                 struct btree *b = as->new_nodes[--as->nr_new_nodes];
617
618                 BUG_ON(b->will_make_reachable != (unsigned long) as);
619                 b->will_make_reachable = 0;
620                 mutex_unlock(&c->btree_interior_update_lock);
621
622                 /*
623                  * b->will_make_reachable prevented it from being written, so
624                  * write it now if it needs to be written:
625                  */
626                 btree_node_lock_type(c, b, SIX_LOCK_read);
627                 bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
628                 six_unlock_read(&b->lock);
629                 mutex_lock(&c->btree_interior_update_lock);
630         }
631
632         while (as->nr_pending)
633                 bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
634
635         mutex_unlock(&c->btree_interior_update_lock);
636
637         closure_wake_up(&as->wait);
638
639         bch2_btree_update_free(as);
640 }
641
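/*
 * For a new btree root: make sure the journal entry that points to the new
 * root is written before we free the old nodes on disk:
 */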
642 static void btree_update_wait_on_journal(struct closure *cl)
643 {
644         struct btree_update *as = container_of(cl, struct btree_update, cl);
645         struct bch_fs *c = as->c;
646         int ret;
647
648         ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
649         if (ret == -EAGAIN) {
650                 continue_at(cl, btree_update_wait_on_journal, system_wq);
651                 return;
652         }
653         if (ret < 0)
654                 goto err;
655
656         bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
657 err:
658         continue_at(cl, btree_update_nodes_reachable, system_wq);
659 }
660
661 static void btree_update_nodes_written(struct closure *cl)
662 {
663         struct btree_update *as = container_of(cl, struct btree_update, cl);
664         struct bch_fs *c = as->c;
665         struct btree *b;
666
667         /*
668          * We did an update to a parent node where the pointers we added pointed
669          * to child nodes that weren't written yet: now, the child nodes have
670          * been written so we can write out the update to the interior node.
671          */
672         mutex_lock(&c->btree_interior_update_lock);
673         as->nodes_written = true;
674 retry:
675         as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
676                                       struct btree_update, unwritten_list);
677         if (!as || !as->nodes_written) {
678                 mutex_unlock(&c->btree_interior_update_lock);
679                 return;
680         }
681
682         switch (as->mode) {
683         case BTREE_INTERIOR_NO_UPDATE:
684                 BUG();
685         case BTREE_INTERIOR_UPDATING_NODE:
686                 /* The usual case: */
687                 b = READ_ONCE(as->b);
688
689                 if (!six_trylock_read(&b->lock)) {
690                         mutex_unlock(&c->btree_interior_update_lock);
691                         btree_node_lock_type(c, b, SIX_LOCK_read);
692                         six_unlock_read(&b->lock);
693                         mutex_lock(&c->btree_interior_update_lock);
694                         goto retry;
695                 }
696
697                 BUG_ON(!btree_node_dirty(b));
698                 closure_wait(&btree_current_write(b)->wait, &as->cl);
699
700                 list_del(&as->write_blocked_list);
701
702                 /*
703                  * for flush_held_btree_writes() waiting on updates to flush or
704                  * nodes to be writeable:
705                  */
706                 closure_wake_up(&c->btree_interior_update_wait);
707
708                 list_del(&as->unwritten_list);
709                 mutex_unlock(&c->btree_interior_update_lock);
710
711                 /*
712                  * b->write_blocked prevented it from being written, so
713                  * write it now if it needs to be written:
714                  */
715                 bch2_btree_node_write_cond(c, b, true);
716                 six_unlock_read(&b->lock);
717                 continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
718                 break;
719
720         case BTREE_INTERIOR_UPDATING_AS:
721                 /*
722                  * The btree node we originally updated has been freed and is
723                  * being rewritten - so we don't need to write anything here, we just
724                  * need to signal to that btree_update that it's ok to make the
725                  * new replacement node visible:
726                  */
727                 closure_put(&as->parent_as->cl);
728
729                 /*
730                  * and then we have to wait on that btree_update to finish:
731                  */
732                 closure_wait(&as->parent_as->wait, &as->cl);
733
734                 list_del(&as->unwritten_list);
735                 mutex_unlock(&c->btree_interior_update_lock);
736
737                 continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
738                 break;
739
740         case BTREE_INTERIOR_UPDATING_ROOT:
741                 /* b is the new btree root: */
742                 b = READ_ONCE(as->b);
743
744                 if (!six_trylock_read(&b->lock)) {
745                         mutex_unlock(&c->btree_interior_update_lock);
746                         btree_node_lock_type(c, b, SIX_LOCK_read);
747                         six_unlock_read(&b->lock);
748                         mutex_lock(&c->btree_interior_update_lock);
749                         goto retry;
750                 }
751
752                 BUG_ON(c->btree_roots[b->btree_id].as != as);
753                 c->btree_roots[b->btree_id].as = NULL;
754
755                 bch2_btree_set_root_ondisk(c, b, WRITE);
756
757                 /*
758                  * We don't have to wait on anything here (before
759                  * btree_update_nodes_reachable frees the old nodes
760                  * ondisk) - we've ensured that the very next journal write will
761                  * have the pointer to the new root, and before the allocator
762                  * can reuse the old nodes it'll have to do a journal commit:
763                  */
764                 six_unlock_read(&b->lock);
765
766                 list_del(&as->unwritten_list);
767                 mutex_unlock(&c->btree_interior_update_lock);
768
769                 /*
770                  * Bit of funny circularity going on here we have to break:
771                  *
772                  * We have to drop our journal pin before writing the journal
773                  * entry that points to the new btree root: else, we could
774                  * deadlock if the journal currently happens to be full.
775                  *
776                  * This means we're dropping the journal pin _before_ the new
777                  * nodes are technically reachable - but this is safe, because
778                  * after the bch2_btree_set_root_ondisk() call above they will
779                  * be reachable as of the very next journal write:
780                  */
781                 bch2_journal_pin_drop(&c->journal, &as->journal);
782
783                 as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
784
785                 btree_update_wait_on_journal(&as->cl);
786                 break;
787         }
788
789         mutex_lock(&c->btree_interior_update_lock);
790         goto retry;
791 }
792
793 /*
794  * We're updating @b with pointers to nodes that haven't finished writing yet:
795  * block @b from being written until @as completes
796  */
797 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
798 {
799         struct bch_fs *c = as->c;
800
801         mutex_lock(&c->btree_interior_update_lock);
802         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
803
804         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
805         BUG_ON(!btree_node_dirty(b));
806
807         as->mode = BTREE_INTERIOR_UPDATING_NODE;
808         as->b = b;
809         list_add(&as->write_blocked_list, &b->write_blocked);
810
811         mutex_unlock(&c->btree_interior_update_lock);
812
813         /*
814          * In general, when you're staging things in a journal that will later
815          * be written elsewhere, and you also want to guarantee ordering: that
816          * is, if you have updates a, b, c, after a crash you should never see c
817          * and not a or b - there's a problem:
818          *
819          * If the final destination of the update(s) (i.e. btree node) can be
820          * written/flushed _before_ the relevant journal entry - oops, that
821          * breaks ordering, since the various leaf nodes can be written in any
822          * order.
823          *
824          * Normally we use bset->journal_seq to deal with this - if during
825          * recovery we find a btree node write that's newer than the newest
826          * journal entry, we just ignore it - we don't need it, anything we're
827          * supposed to have (that we reported as completed via fsync()) will
828          * still be in the journal, and as far as the state of the journal is
829          * concerned that btree node write never happened.
830          *
831          * That breaks when we're rewriting/splitting/merging nodes, since we're
832          * mixing btree node writes that haven't happened yet with previously
833          * written data that has been reported as completed to the journal.
834          *
835          * Thus, before making the new nodes reachable, we have to wait for the
836          * newest journal sequence number we have data for to be written (if it
837          * hasn't been yet).
838          */
839         bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
840 }
841
842 static void interior_update_flush(struct journal *j,
843                         struct journal_entry_pin *pin, u64 seq)
844 {
845         struct btree_update *as =
846                 container_of(pin, struct btree_update, journal);
847
848         bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
849 }
850
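/*
 * @child is a btree_update whose effects aren't persistent yet; make it wait
 * on @as instead, and transfer its journal pin to @as:
 */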
851 static void btree_update_reparent(struct btree_update *as,
852                                   struct btree_update *child)
853 {
854         struct bch_fs *c = as->c;
855
856         child->b = NULL;
857         child->mode = BTREE_INTERIOR_UPDATING_AS;
858         child->parent_as = as;
859         closure_get(&as->cl);
860
861         /*
862          * When we write a new btree root, we have to drop our journal pin
863          * _before_ the new nodes are technically reachable; see
864          * btree_update_nodes_written().
865          *
866          * This goes for journal pins that are recursively blocked on us - so,
867          * just transfer the journal pin to the new interior update so
868          * btree_update_nodes_written() can drop it.
869          */
870         bch2_journal_pin_copy(&c->journal, &as->journal,
871                               &child->journal, interior_update_flush);
872         bch2_journal_pin_drop(&c->journal, &child->journal);
873
874         as->journal_seq = max(as->journal_seq, child->journal_seq);
875 }
876
877 static void btree_update_updated_root(struct btree_update *as)
878 {
879         struct bch_fs *c = as->c;
880         struct btree_root *r = &c->btree_roots[as->btree_id];
881
882         mutex_lock(&c->btree_interior_update_lock);
883         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
884
885         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
886
887         /*
888          * Old root might not be persistent yet - if so, redirect its
889          * btree_update operation to point to us:
890          */
891         if (r->as)
892                 btree_update_reparent(as, r->as);
893
894         as->mode = BTREE_INTERIOR_UPDATING_ROOT;
895         as->b = r->b;
896         r->as = as;
897
898         mutex_unlock(&c->btree_interior_update_lock);
899
900         /*
901          * When we're rewriting nodes and updating interior nodes, there's an
902          * issue with updates that haven't been written in the journal getting
903          * mixed together with older data - see btree_update_updated_node()
904          * for the explanation.
905          *
906          * However, this doesn't affect us when we're writing a new btree root -
907          * because to make that new root reachable we have to write out a new
908          * journal entry, which must necessarily be newer than as->journal_seq.
909          */
910 }
911
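/*
 * Record that @as is the update that will make new node @b reachable; writes
 * of @b are blocked until that update completes:
 */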
912 static void btree_node_will_make_reachable(struct btree_update *as,
913                                            struct btree *b)
914 {
915         struct bch_fs *c = as->c;
916
917         mutex_lock(&c->btree_interior_update_lock);
918         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
919         BUG_ON(b->will_make_reachable);
920
921         as->new_nodes[as->nr_new_nodes++] = b;
922         b->will_make_reachable = 1UL|(unsigned long) as;
923
924         closure_get(&as->cl);
925         mutex_unlock(&c->btree_interior_update_lock);
926 }
927
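/*
 * @b is being freed before it was ever made reachable: drop it from its
 * btree_update's list of new nodes:
 */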
928 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
929 {
930         struct btree_update *as;
931         unsigned long v;
932         unsigned i;
933
934         mutex_lock(&c->btree_interior_update_lock);
935         v = xchg(&b->will_make_reachable, 0);
936         as = (struct btree_update *) (v & ~1UL);
937
938         if (!as) {
939                 mutex_unlock(&c->btree_interior_update_lock);
940                 return;
941         }
942
943         for (i = 0; i < as->nr_new_nodes; i++)
944                 if (as->new_nodes[i] == b)
945                         goto found;
946
947         BUG();
948 found:
949         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
950         mutex_unlock(&c->btree_interior_update_lock);
951
952         if (v & 1)
953                 closure_put(&as->cl);
954 }
955
956 static void btree_interior_update_add_node_reference(struct btree_update *as,
957                                                      struct btree *b)
958 {
959         struct bch_fs *c = as->c;
960         struct pending_btree_node_free *d;
961
962         mutex_lock(&c->btree_interior_update_lock);
963
964         /* Add this node to the list of nodes being freed: */
965         BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
966
967         d = &as->pending[as->nr_pending++];
968         d->index_update_done    = false;
969         d->seq                  = b->data->keys.seq;
970         d->btree_id             = b->btree_id;
971         d->level                = b->level;
972         bkey_copy(&d->key, &b->key);
973
974         mutex_unlock(&c->btree_interior_update_lock);
975 }
976
977 /*
978  * @b is being split/rewritten: it may have pointers to not-yet-written btree
979  * nodes and thus outstanding btree_updates - redirect @b's
980  * btree_updates to point to this btree_update:
981  */
982 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
983                                                struct btree *b)
984 {
985         struct bch_fs *c = as->c;
986         struct closure *cl, *cl_n;
987         struct btree_update *p, *n;
988         struct btree_write *w;
989         struct bset_tree *t;
990
991         set_btree_node_dying(b);
992
993         if (btree_node_fake(b))
994                 return;
995
996         btree_interior_update_add_node_reference(as, b);
997
998         /*
999          * Does this node have data that hasn't been written in the journal?
1000          *
1001          * If so, we have to wait for the corresponding journal entry to be
1002          * written before making the new nodes reachable - we can't just carry
1003          * over the bset->journal_seq tracking, since we'll be mixing those keys
1004          * in with keys that aren't in the journal anymore:
1005          */
1006         for_each_bset(b, t)
1007                 as->journal_seq = max(as->journal_seq,
1008                                       le64_to_cpu(bset(b, t)->journal_seq));
1009
1010         mutex_lock(&c->btree_interior_update_lock);
1011
1012         /*
1013          * Does this node have any btree_update operations preventing
1014          * it from being written?
1015          *
1016          * If so, redirect them to point to this btree_update: we can
1017          * write out our new nodes, but we won't make them visible until those
1018          * operations complete
1019          */
1020         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
1021                 list_del(&p->write_blocked_list);
1022                 btree_update_reparent(as, p);
1023
1024                 /*
1025                  * for flush_held_btree_writes() waiting on updates to flush or
1026                  * nodes to be writeable:
1027                  */
1028                 closure_wake_up(&c->btree_interior_update_wait);
1029         }
1030
1031         clear_btree_node_dirty(b);
1032         clear_btree_node_need_write(b);
1033         w = btree_current_write(b);
1034
1035         /*
1036          * Does this node have any btree_update operations waiting on this node
1037          * to be written?
1038          *
1039          * If so, wake them up when this btree_update operation is reachable:
1040          */
1041         llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
1042                 llist_add(&cl->list, &as->wait.list);
1043
1044         /*
1045          * Does this node have unwritten data that has a pin on the journal?
1046          *
1047          * If so, transfer that pin to the btree_update operation -
1048          * note that if we're freeing multiple nodes, we only need to keep the
1049          * oldest pin of any of the nodes we're freeing. We'll release the pin
1050          * when the new nodes are persistent and reachable on disk:
1051          */
1052         bch2_journal_pin_copy(&c->journal, &as->journal,
1053                               &w->journal, interior_update_flush);
1054         bch2_journal_pin_drop(&c->journal, &w->journal);
1055
1056         w = btree_prev_write(b);
1057         bch2_journal_pin_copy(&c->journal, &as->journal,
1058                               &w->journal, interior_update_flush);
1059         bch2_journal_pin_drop(&c->journal, &w->journal);
1060
1061         mutex_unlock(&c->btree_interior_update_lock);
1062 }
1063
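/*
 * The caller has finished building the update: return any unused preallocated
 * nodes and continue with btree_update_nodes_written() once the writes this
 * update is waiting on have completed:
 */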
1064 void bch2_btree_update_done(struct btree_update *as)
1065 {
1066         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
1067
1068         bch2_btree_reserve_put(as->c, as->reserve);
1069         as->reserve = NULL;
1070
1071         continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
1072 }
1073
1074 struct btree_update *
1075 bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
1076                         unsigned nr_nodes, unsigned flags,
1077                         struct closure *cl)
1078 {
1079         struct btree_reserve *reserve;
1080         struct btree_update *as;
1081
1082         reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
1083         if (IS_ERR(reserve))
1084                 return ERR_CAST(reserve);
1085
1086         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
1087         memset(as, 0, sizeof(*as));
1088         closure_init(&as->cl, NULL);
1089         as->c           = c;
1090         as->mode        = BTREE_INTERIOR_NO_UPDATE;
1091         as->btree_id    = id;
1092         as->reserve     = reserve;
1093         INIT_LIST_HEAD(&as->write_blocked_list);
1094
1095         bch2_keylist_init(&as->parent_keys, as->inline_keys);
1096
1097         mutex_lock(&c->btree_interior_update_lock);
1098         list_add_tail(&as->list, &c->btree_interior_update_list);
1099         mutex_unlock(&c->btree_interior_update_lock);
1100
1101         return as;
1102 }
1103
1104 /* Btree root updates: */
1105
1106 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1107 {
1108         /* Root nodes cannot be reaped */
1109         mutex_lock(&c->btree_cache.lock);
1110         list_del_init(&b->list);
1111         mutex_unlock(&c->btree_cache.lock);
1112
1113         mutex_lock(&c->btree_root_lock);
1114         BUG_ON(btree_node_root(c, b) &&
1115                (b->level < btree_node_root(c, b)->level ||
1116                 !btree_node_dying(btree_node_root(c, b))));
1117
1118         btree_node_root(c, b) = b;
1119         mutex_unlock(&c->btree_root_lock);
1120
1121         bch2_recalc_btree_reserve(c);
1122 }
1123
1124 static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
1125 {
1126         struct bch_fs *c = as->c;
1127         struct btree *old = btree_node_root(c, b);
1128         struct bch_fs_usage *fs_usage;
1129
1130         __bch2_btree_set_root_inmem(c, b);
1131
1132         mutex_lock(&c->btree_interior_update_lock);
1133         percpu_down_read(&c->mark_lock);
1134         fs_usage = bch2_fs_usage_scratch_get(c);
1135
1136         bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
1137                       0, 0, fs_usage, 0,
1138                       BTREE_TRIGGER_INSERT);
1139         if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
1140                 bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
1141                                      0, 0, NULL, 0,
1142                                      BTREE_TRIGGER_INSERT|
1143                                      BTREE_TRIGGER_GC);
1144
1145         if (old && !btree_node_fake(old))
1146                 bch2_btree_node_free_index(as, NULL,
1147                                            bkey_i_to_s_c(&old->key),
1148                                            fs_usage);
1149         bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
1150
1151         bch2_fs_usage_scratch_put(c, fs_usage);
1152         percpu_up_read(&c->mark_lock);
1153         mutex_unlock(&c->btree_interior_update_lock);
1154 }
1155
1156 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
1157 {
1158         struct btree_root *r = &c->btree_roots[b->btree_id];
1159
1160         mutex_lock(&c->btree_root_lock);
1161
1162         BUG_ON(b != r->b);
1163         bkey_copy(&r->key, &b->key);
1164         r->level = b->level;
1165         r->alive = true;
1166         if (rw == WRITE)
1167                 c->btree_roots_dirty = true;
1168
1169         mutex_unlock(&c->btree_root_lock);
1170 }
1171
1172 /**
1173  * bch2_btree_set_root - update the root in memory and on disk
1174  *
1175  * To ensure forward progress, the current task must not be holding any
1176  * btree node write locks. However, you must hold an intent lock on the
1177  * old root.
1178  *
1179  * Note: This allocates a journal entry but doesn't add any keys to
1180  * it.  All the btree roots are part of every journal write, so there
1181  * is nothing new to be done.  This just guarantees that there is a
1182  * journal write.
1183  */
1184 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
1185                                 struct btree_iter *iter)
1186 {
1187         struct bch_fs *c = as->c;
1188         struct btree *old;
1189
1190         trace_btree_set_root(c, b);
1191         BUG_ON(!b->written &&
1192                !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
1193
1194         old = btree_node_root(c, b);
1195
1196         /*
1197          * Ensure no one is using the old root while we switch to the
1198          * new root:
1199          */
1200         bch2_btree_node_lock_write(old, iter);
1201
1202         bch2_btree_set_root_inmem(as, b);
1203
1204         btree_update_updated_root(as);
1205
1206         /*
1207          * Unlock old root after new root is visible:
1208          *
1209          * The new root isn't persistent, but that's ok: we still have
1210          * an intent lock on the new root, and any updates that would
1211          * depend on the new root would have to update the new root.
1212          */
1213         bch2_btree_node_unlock_write(old, iter);
1214 }
1215
1216 /* Interior node updates: */
1217
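/*
 * Insert a new btree pointer into interior node @b: mark the new key, account
 * for any existing key at the same position that it overwrites, then do the
 * insert itself:
 */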
1218 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1219                                         struct btree_iter *iter,
1220                                         struct bkey_i *insert,
1221                                         struct btree_node_iter *node_iter)
1222 {
1223         struct bch_fs *c = as->c;
1224         struct bch_fs_usage *fs_usage;
1225         struct bkey_packed *k;
1226         struct bkey tmp;
1227
1228         BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
1229
1230         mutex_lock(&c->btree_interior_update_lock);
1231         percpu_down_read(&c->mark_lock);
1232         fs_usage = bch2_fs_usage_scratch_get(c);
1233
1234         bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
1235                              0, 0, fs_usage, 0,
1236                              BTREE_TRIGGER_INSERT);
1237
1238         if (gc_visited(c, gc_pos_btree_node(b)))
1239                 bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
1240                                      0, 0, NULL, 0,
1241                                      BTREE_TRIGGER_INSERT|
1242                                      BTREE_TRIGGER_GC);
1243
1244         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1245                bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1246                 bch2_btree_node_iter_advance(node_iter, b);
1247
1248         /*
1249          * If we're overwriting, look up the pending delete and mark it, so that
1250          * gc marks it on the pending delete list:
1251          */
1252         if (k && !bkey_cmp_packed(b, k, &insert->k))
1253                 bch2_btree_node_free_index(as, b,
1254                                            bkey_disassemble(b, k, &tmp),
1255                                            fs_usage);
1256
1257         bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
1258
1259         bch2_fs_usage_scratch_put(c, fs_usage);
1260         percpu_up_read(&c->mark_lock);
1261         mutex_unlock(&c->btree_interior_update_lock);
1262
1263         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1264         set_btree_node_dirty(b);
1265         set_btree_node_need_write(b);
1266 }
1267
1268 /*
1269  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1270  * node)
1271  */
1272 static struct btree *__btree_split_node(struct btree_update *as,
1273                                         struct btree *n1,
1274                                         struct btree_iter *iter)
1275 {
1276         size_t nr_packed = 0, nr_unpacked = 0;
1277         struct btree *n2;
1278         struct bset *set1, *set2;
1279         struct bkey_packed *k, *prev = NULL;
1280
1281         n2 = bch2_btree_node_alloc(as, n1->level);
1282
1283         n2->data->max_key       = n1->data->max_key;
1284         n2->data->format        = n1->format;
1285         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1286         n2->key.k.p = n1->key.k.p;
1287
1288         btree_node_set_format(n2, n2->data->format);
1289
1290         set1 = btree_bset_first(n1);
1291         set2 = btree_bset_first(n2);
1292
1293         /*
1294          * Has to be a linear search because we don't have an auxiliary
1295          * search tree yet
1296          */
1297         k = set1->start;
1298         while (1) {
1299                 struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
1300
1301                 if (n == vstruct_last(set1))
1302                         break;
1303                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1304                         break;
1305
1306                 if (bkey_packed(k))
1307                         nr_packed++;
1308                 else
1309                         nr_unpacked++;
1310
1311                 prev = k;
1312                 k = n;
1313         }
1314
1315         BUG_ON(!prev);
1316
1317         btree_set_max(n1, bkey_unpack_pos(n1, prev));
1318         btree_set_min(n2, btree_type_successor(n1->btree_id, n1->key.k.p));
1319
1320         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1321         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1322
1323         set_btree_bset_end(n1, n1->set);
1324         set_btree_bset_end(n2, n2->set);
1325
1326         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1327         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1328         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1329         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1330
1331         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1332         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1333         n1->nr.packed_keys      = nr_packed;
1334         n1->nr.unpacked_keys    = nr_unpacked;
1335
1336         BUG_ON(!set1->u64s);
1337         BUG_ON(!set2->u64s);
1338
1339         memcpy_u64s(set2->start,
1340                     vstruct_end(set1),
1341                     le16_to_cpu(set2->u64s));
1342
1343         btree_node_reset_sib_u64s(n1);
1344         btree_node_reset_sib_u64s(n2);
1345
1346         bch2_verify_btree_nr_keys(n1);
1347         bch2_verify_btree_nr_keys(n2);
1348
1349         if (n1->level) {
1350                 btree_node_interior_verify(n1);
1351                 btree_node_interior_verify(n2);
1352         }
1353
1354         return n2;
1355 }
1356
1357 /*
1358  * For updates to interior nodes, we've got to do the insert before we split
1359  * because the stuff we're inserting has to be inserted atomically. Post split,
1360  * the keys might have to go in different nodes and the split would no longer be
1361  * atomic.
1362  *
1363  * Worse, if the insert is from btree node coalescing and we do the insert
1364  * after the split (and pick the pivot), the pivot we pick might be between
1365  * nodes that were coalesced, and thus in the middle of a child node post
1366  * coalescing:
1367  */
1368 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1369                                     struct btree_iter *iter,
1370                                     struct keylist *keys)
1371 {
1372         struct btree_node_iter node_iter;
1373         struct bkey_i *k = bch2_keylist_front(keys);
1374         struct bkey_packed *src, *dst, *n;
1375         struct bset *i;
1376
1377         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1378
1379         bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1380
1381         while (!bch2_keylist_empty(keys)) {
1382                 k = bch2_keylist_front(keys);
1383
1384                 BUG_ON(bch_keylist_u64s(keys) >
1385                        bch_btree_keys_u64s_remaining(as->c, b));
1386                 BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
1387                 BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
1388
1389                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1390                 bch2_keylist_pop_front(keys);
1391         }
1392
1393         /*
1394          * We can't tolerate whiteouts here - with whiteouts there can be
1395          * duplicate keys, and it would be rather bad if we picked a duplicate
1396          * for the pivot:
1397          */
1398         i = btree_bset_first(b);
1399         src = dst = i->start;
1400         while (src != vstruct_last(i)) {
1401                 n = bkey_next_skip_noops(src, vstruct_last(i));
1402                 if (!bkey_deleted(src)) {
1403                         memmove_u64s_down(dst, src, src->u64s);
1404                         dst = bkey_next(dst);
1405                 }
1406                 src = n;
1407         }
1408
1409         i->u64s = cpu_to_le16((u64 *) dst - i->_data);
1410         set_btree_bset_end(b, b->set);
1411
1412         BUG_ON(b->nsets != 1 ||
1413                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1414
1415         btree_node_interior_verify(b);
1416 }
1417
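/*
 * Split (or just compact) @b: allocate replacement node(s), insert any keys
 * destined for @b, write out the new node(s), then update the parent - or set
 * a new root - to point at them, and finally free @b:
 */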
1418 static void btree_split(struct btree_update *as, struct btree *b,
1419                         struct btree_iter *iter, struct keylist *keys,
1420                         unsigned flags)
1421 {
1422         struct bch_fs *c = as->c;
1423         struct btree *parent = btree_node_parent(iter, b);
1424         struct btree *n1, *n2 = NULL, *n3 = NULL;
1425         u64 start_time = local_clock();
1426
1427         BUG_ON(!parent && (b != btree_node_root(c, b)));
1428         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1429
1430         bch2_btree_interior_update_will_free_node(as, b);
1431
1432         n1 = bch2_btree_node_alloc_replacement(as, b);
1433
1434         if (keys)
1435                 btree_split_insert_keys(as, n1, iter, keys);
1436
1437         if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
1438                 trace_btree_split(c, b);
1439
1440                 n2 = __btree_split_node(as, n1, iter);
1441
1442                 bch2_btree_build_aux_trees(n2);
1443                 bch2_btree_build_aux_trees(n1);
1444                 six_unlock_write(&n2->lock);
1445                 six_unlock_write(&n1->lock);
1446
1447                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1448
1449                 /*
1450                  * Note that on recursive splits, parent_keys == keys, so we
1451                  * can't start adding new keys to parent_keys before emptying it
1452                  * out (which we did with btree_split_insert_keys() above)
1453                  */
1454                 bch2_keylist_add(&as->parent_keys, &n1->key);
1455                 bch2_keylist_add(&as->parent_keys, &n2->key);
1456
1457                 if (!parent) {
1458                         /* Depth increases, make a new root */
1459                         n3 = __btree_root_alloc(as, b->level + 1);
1460
1461                         n3->sib_u64s[0] = U16_MAX;
1462                         n3->sib_u64s[1] = U16_MAX;
1463
1464                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1465
1466                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1467                 }
1468         } else {
1469                 trace_btree_compact(c, b);
1470
1471                 bch2_btree_build_aux_trees(n1);
1472                 six_unlock_write(&n1->lock);
1473
1474                 bch2_keylist_add(&as->parent_keys, &n1->key);
1475         }
1476
1477         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1478
1479         /* New nodes all written, now make them visible: */
1480
1481         if (parent) {
1482                 /* Split a non root node */
1483                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1484         } else if (n3) {
1485                 bch2_btree_set_root(as, n3, iter);
1486         } else {
1487                 /* Root filled up but didn't need to be split */
1488                 bch2_btree_set_root(as, n1, iter);
1489         }
1490
1491         bch2_open_buckets_put(c, &n1->ob);
1492         if (n2)
1493                 bch2_open_buckets_put(c, &n2->ob);
1494         if (n3)
1495                 bch2_open_buckets_put(c, &n3->ob);
1496
1497         /* Successful split, update the iterator to point to the new nodes: */
1498
1499         six_lock_increment(&b->lock, SIX_LOCK_intent);
1500         bch2_btree_iter_node_drop(iter, b);
1501         if (n3)
1502                 bch2_btree_iter_node_replace(iter, n3);
1503         if (n2)
1504                 bch2_btree_iter_node_replace(iter, n2);
1505         bch2_btree_iter_node_replace(iter, n1);
1506
1507         /*
1508          * The old node must be freed (in memory) _before_ unlocking the new
1509          * nodes - else another thread could re-acquire a read lock on the old
1510          * node after another thread has locked and updated the new node, thus
1511          * seeing stale data:
1512          */
1513         bch2_btree_node_free_inmem(c, b, iter);
1514
1515         if (n3)
1516                 six_unlock_intent(&n3->lock);
1517         if (n2)
1518                 six_unlock_intent(&n2->lock);
1519         six_unlock_intent(&n1->lock);
1520
1521         bch2_btree_trans_verify_locks(iter->trans);
1522
1523         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
1524                                start_time);
1525 }
1526
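/*
 * Insert @keys into the interior node @b without splitting - the caller has
 * already checked that they fit. The node iterator is rewound so the keys go
 * in at or before @iter's current position, and the node iterators of any
 * other btree_iters pointing at @b are revalidated afterwards.
 */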
1527 static void
1528 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1529                                 struct btree_iter *iter, struct keylist *keys)
1530 {
1531         struct btree_iter *linked;
1532         struct btree_node_iter node_iter;
1533         struct bkey_i *insert = bch2_keylist_front(keys);
1534         struct bkey_packed *k;
1535
1536         /* Don't screw up @iter's position: */
1537         node_iter = iter->l[b->level].iter;
1538
1539         /*
1540          * btree_split(), btree_gc_coalesce() will insert keys before
1541          * the iterator's current position - they know the keys go in
1542          * the node the iterator points to:
1543          */
1544         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1545                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1546                 ;
1547
1548         while (!bch2_keylist_empty(keys)) {
1549                 insert = bch2_keylist_front(keys);
1550
1551                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1552                 bch2_keylist_pop_front(keys);
1553         }
1554
1555         btree_update_updated_node(as, b);
1556
1557         trans_for_each_iter_with_node(iter->trans, b, linked)
1558                 bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
1559
1560         bch2_btree_trans_verify_iters(iter->trans, b);
1561 }
1562
1563 /**
1564  * bch2_btree_insert_node - insert bkeys into a given (interior) btree node
1565  *
1566  * @as:                 btree_update this insert is part of
1567  * @b:                  interior node to insert into
1568  * @iter:               btree iterator
1569  * @keys:               list of keys to insert
1570  *
1571  * Inserts the keys into @b if they fit; if they don't, the node is split
1572  * via btree_split(), which inserts them into the new nodes instead.
1573  * Interior node updates are always atomic.
1574  */
1575 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1576                             struct btree_iter *iter, struct keylist *keys,
1577                             unsigned flags)
1578 {
1579         struct bch_fs *c = as->c;
1580         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1581         int old_live_u64s = b->nr.live_u64s;
1582         int live_u64s_added, u64s_added;
1583
1584         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1585         BUG_ON(!b->level);
1586         BUG_ON(!as || as->b);
1587         bch2_verify_keylist_sorted(keys);
1588
1589         if (as->must_rewrite)
1590                 goto split;
1591
1592         bch2_btree_node_lock_for_insert(c, b, iter);
1593
1594         if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
1595                 bch2_btree_node_unlock_write(b, iter);
1596                 goto split;
1597         }
1598
1599         bch2_btree_insert_keys_interior(as, b, iter, keys);
1600
1601         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1602         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1603
1604         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1605                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1606         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1607                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1608
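        /*
         * If the bset grew by more than the live key count did, some existing
         * keys were overwritten and left behind as whiteouts - see if
         * compacting them reclaims the space:
         */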
1609         if (u64s_added > live_u64s_added &&
1610             bch2_maybe_compact_whiteouts(c, b))
1611                 bch2_btree_iter_reinit_node(iter, b);
1612
1613         bch2_btree_node_unlock_write(b, iter);
1614
1615         btree_node_interior_verify(b);
1616
1617         /*
1618          * when called from the btree_split path the new nodes aren't added to
1619          * the btree iterator yet, so the merge path's unlock/wait/relock dance
1620          * won't work:
1621          */
1622         bch2_foreground_maybe_merge(c, iter, b->level,
1623                                     flags|BTREE_INSERT_NOUNLOCK);
1624         return;
1625 split:
1626         btree_split(as, b, iter, keys, flags);
1627 }
1628
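/*
 * Split the leaf node @iter currently points to: takes gc_lock (unless the
 * caller already holds it), upgrades the iterator to intent locks all the
 * way to the root (since the split may have to propagate upwards), starts an
 * interior btree_update, and calls btree_split().
 */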
1629 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1630                           unsigned flags)
1631 {
1632         struct btree_trans *trans = iter->trans;
1633         struct btree *b = iter->l[0].b;
1634         struct btree_update *as;
1635         struct closure cl;
1636         int ret = 0;
1637         struct btree_iter *linked;
1638
1639         /*
1640          * We already have a disk reservation and open buckets pinned; this
1641          * allocation must not block:
1642          */
1643         trans_for_each_iter(trans, linked)
1644                 if (linked->btree_id == BTREE_ID_EXTENTS)
1645                         flags |= BTREE_INSERT_USE_RESERVE;
1646
1647         closure_init_stack(&cl);
1648
1649         /* Hack, because gc and splitting nodes don't mix yet: */
1650         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1651             !down_read_trylock(&c->gc_lock)) {
1652                 if (flags & BTREE_INSERT_NOUNLOCK)
1653                         return -EINTR;
1654
1655                 bch2_trans_unlock(trans);
1656                 down_read(&c->gc_lock);
1657
1658                 if (!bch2_trans_relock(trans))
1659                         ret = -EINTR;
1660         }
1661
1662         /*
1663          * XXX: figure out how far we might need to split,
1664          * instead of locking/reserving all the way to the root:
1665          */
1666         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1667                 trace_trans_restart_iter_upgrade(trans->ip);
1668                 ret = -EINTR;
1669                 goto out;
1670         }
1671
1672         as = bch2_btree_update_start(c, iter->btree_id,
1673                 btree_update_reserve_required(c, b), flags,
1674                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1675         if (IS_ERR(as)) {
1676                 ret = PTR_ERR(as);
1677                 if (ret == -EAGAIN) {
1678                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1679                         bch2_trans_unlock(trans);
1680                         ret = -EINTR;
1681                 }
1682                 goto out;
1683         }
1684
1685         btree_split(as, b, iter, NULL, flags);
1686         bch2_btree_update_done(as);
1687
1688         /*
1689          * We haven't successfully inserted yet, so don't downgrade all the way
1690          * back to read locks:
1691          */
1692         __bch2_btree_iter_downgrade(iter, 1);
1693 out:
1694         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1695                 up_read(&c->gc_lock);
1696         closure_sync(&cl);
1697         return ret;
1698 }
1699
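/*
 * Try to merge the node at @level that @iter points to with its sibling in
 * direction @sib: if an estimate of the combined size is under the merge
 * threshold, both nodes are sorted into a single new node and the parent is
 * updated with a whiteout for the left node's key plus the new node's key.
 */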
1700 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1701                                    struct btree_iter *iter,
1702                                    unsigned level,
1703                                    unsigned flags,
1704                                    enum btree_node_sibling sib)
1705 {
1706         struct btree_trans *trans = iter->trans;
1707         struct btree_update *as;
1708         struct bkey_format_state new_s;
1709         struct bkey_format new_f;
1710         struct bkey_i delete;
1711         struct btree *b, *m, *n, *prev, *next, *parent;
1712         struct closure cl;
1713         size_t sib_u64s;
1714         int ret = 0;
1715
1716         BUG_ON(!btree_node_locked(iter, level));
1717
1718         closure_init_stack(&cl);
1719 retry:
1720         BUG_ON(!btree_node_locked(iter, level));
1721
1722         b = iter->l[level].b;
1723
1724         parent = btree_node_parent(iter, b);
1725         if (!parent)
1726                 goto out;
1727
1728         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1729                 goto out;
1730
1731         /* XXX: can't be holding read locks */
1732         m = bch2_btree_node_get_sibling(c, iter, b, sib);
1733         if (IS_ERR(m)) {
1734                 ret = PTR_ERR(m);
1735                 goto err;
1736         }
1737
1738         /* NULL means no sibling: */
1739         if (!m) {
1740                 b->sib_u64s[sib] = U16_MAX;
1741                 goto out;
1742         }
1743
1744         if (sib == btree_prev_sib) {
1745                 prev = m;
1746                 next = b;
1747         } else {
1748                 prev = b;
1749                 next = m;
1750         }
1751
1752         bch2_bkey_format_init(&new_s);
1753         __bch2_btree_calc_format(&new_s, b);
1754         __bch2_btree_calc_format(&new_s, m);
1755         new_f = bch2_bkey_format_done(&new_s);
1756
1757         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1758                 btree_node_u64s_with_format(m, &new_f);
1759
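        /*
         * sib_u64s estimates the size of the would-be merged node; anything
         * above BTREE_FOREGROUND_MERGE_HYSTERESIS(c) is counted at half
         * weight, i.e. new = hysteresis + (old - hysteresis) / 2:
         */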
1760         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1761                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1762                 sib_u64s /= 2;
1763                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1764         }
1765
1766         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1767         b->sib_u64s[sib] = sib_u64s;
1768
1769         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1770                 six_unlock_intent(&m->lock);
1771                 goto out;
1772         }
1773
1774         /* We're changing btree topology, which doesn't mix with gc: */
1775         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1776             !down_read_trylock(&c->gc_lock))
1777                 goto err_cycle_gc_lock;
1778
1779         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1780                 ret = -EINTR;
1781                 goto err_unlock;
1782         }
1783
1784         as = bch2_btree_update_start(c, iter->btree_id,
1785                          btree_update_reserve_required(c, parent) + 1,
1786                          BTREE_INSERT_NOFAIL|
1787                          BTREE_INSERT_USE_RESERVE,
1788                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1789         if (IS_ERR(as)) {
1790                 ret = PTR_ERR(as);
1791                 goto err_unlock;
1792         }
1793
1794         trace_btree_merge(c, b);
1795
1796         bch2_btree_interior_update_will_free_node(as, b);
1797         bch2_btree_interior_update_will_free_node(as, m);
1798
1799         n = bch2_btree_node_alloc(as, b->level);
1800
1801         btree_set_min(n, prev->data->min_key);
1802         btree_set_max(n, next->data->max_key);
1803         n->data->format         = new_f;
1804
1805         btree_node_set_format(n, new_f);
1806
1807         bch2_btree_sort_into(c, n, prev);
1808         bch2_btree_sort_into(c, n, next);
1809
1810         bch2_btree_build_aux_trees(n);
1811         six_unlock_write(&n->lock);
1812
1813         bkey_init(&delete.k);
1814         delete.k.p = prev->key.k.p;
1815         bch2_keylist_add(&as->parent_keys, &delete);
1816         bch2_keylist_add(&as->parent_keys, &n->key);
1817
1818         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1819
1820         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1821
1822         bch2_open_buckets_put(c, &n->ob);
1823
1824         six_lock_increment(&b->lock, SIX_LOCK_intent);
1825         bch2_btree_iter_node_drop(iter, b);
1826         bch2_btree_iter_node_drop(iter, m);
1827
1828         bch2_btree_iter_node_replace(iter, n);
1829
1830         bch2_btree_trans_verify_iters(trans, n);
1831
1832         bch2_btree_node_free_inmem(c, b, iter);
1833         bch2_btree_node_free_inmem(c, m, iter);
1834
1835         six_unlock_intent(&n->lock);
1836
1837         bch2_btree_update_done(as);
1838
1839         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1840                 up_read(&c->gc_lock);
1841 out:
1842         bch2_btree_trans_verify_locks(trans);
1843
1844         /*
1845          * Don't downgrade locks here: we're called after successful insert,
1846          * and the caller will downgrade locks after a successful insert
1847          * anyways (in case e.g. a split was required first)
1848          *
1849          * And we're also called when inserting into interior nodes in the
1850          * split path, and downgrading to read locks in there is potentially
1851          * confusing:
1852          */
1853         closure_sync(&cl);
1854         return;
1855
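/*
 * Couldn't take gc_lock without blocking: unless we must not unlock, drop the
 * transaction's locks, then cycle gc_lock (take and release it) to wait for
 * gc before retrying via the error path.
 */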
1856 err_cycle_gc_lock:
1857         six_unlock_intent(&m->lock);
1858
1859         if (flags & BTREE_INSERT_NOUNLOCK)
1860                 goto out;
1861
1862         bch2_trans_unlock(trans);
1863
1864         down_read(&c->gc_lock);
1865         up_read(&c->gc_lock);
1866         ret = -EINTR;
1867         goto err;
1868
1869 err_unlock:
1870         six_unlock_intent(&m->lock);
1871         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1872                 up_read(&c->gc_lock);
1873 err:
1874         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1875
1876         if ((ret == -EAGAIN || ret == -EINTR) &&
1877             !(flags & BTREE_INSERT_NOUNLOCK)) {
1878                 bch2_trans_unlock(trans);
1879                 closure_sync(&cl);
1880                 ret = bch2_btree_iter_traverse(iter);
1881                 if (ret)
1882                         goto out;
1883
1884                 goto retry;
1885         }
1886
1887         goto out;
1888 }
1889
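/*
 * Rewrite @b in place: allocate a replacement node with the same contents,
 * write it out, swap the pointer in the parent (or install it as the new
 * root), and free the old node.
 */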
1890 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1891                                 struct btree *b, unsigned flags,
1892                                 struct closure *cl)
1893 {
1894         struct btree *n, *parent = btree_node_parent(iter, b);
1895         struct btree_update *as;
1896
1897         as = bch2_btree_update_start(c, iter->btree_id,
1898                 (parent
1899                  ? btree_update_reserve_required(c, parent)
1900                  : 0) + 1,
1901                 flags, cl);
1902         if (IS_ERR(as)) {
1903                 trace_btree_gc_rewrite_node_fail(c, b);
1904                 return PTR_ERR(as);
1905         }
1906
1907         bch2_btree_interior_update_will_free_node(as, b);
1908
1909         n = bch2_btree_node_alloc_replacement(as, b);
1910
1911         bch2_btree_build_aux_trees(n);
1912         six_unlock_write(&n->lock);
1913
1914         trace_btree_gc_rewrite_node(c, b);
1915
1916         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1917
1918         if (parent) {
1919                 bch2_keylist_add(&as->parent_keys, &n->key);
1920                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1921         } else {
1922                 bch2_btree_set_root(as, n, iter);
1923         }
1924
1925         bch2_open_buckets_put(c, &n->ob);
1926
1927         six_lock_increment(&b->lock, SIX_LOCK_intent);
1928         bch2_btree_iter_node_drop(iter, b);
1929         bch2_btree_iter_node_replace(iter, n);
1930         bch2_btree_node_free_inmem(c, b, iter);
1931         six_unlock_intent(&n->lock);
1932
1933         bch2_btree_update_done(as);
1934         return 0;
1935 }
1936
1937 /**
1938  * bch2_btree_node_rewrite - Rewrite/move a btree node
1939  *
1940  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. when
1941  * bch2_btree_update_start() has to wait on a reservation)
1942  */
1943 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1944                             __le64 seq, unsigned flags)
1945 {
1946         struct btree_trans *trans = iter->trans;
1947         struct closure cl;
1948         struct btree *b;
1949         int ret;
1950
1951         flags |= BTREE_INSERT_NOFAIL;
1952
1953         closure_init_stack(&cl);
1954
1955         bch2_btree_iter_upgrade(iter, U8_MAX);
1956
1957         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1958                 if (!down_read_trylock(&c->gc_lock)) {
1959                         bch2_trans_unlock(trans);
1960                         down_read(&c->gc_lock);
1961                 }
1962         }
1963
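        /*
         * The node may have been freed or rewritten while we didn't have it
         * locked; only rewrite it if the node found at @iter still has the
         * sequence number the caller passed in:
         */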
1964         while (1) {
1965                 ret = bch2_btree_iter_traverse(iter);
1966                 if (ret)
1967                         break;
1968
1969                 b = bch2_btree_iter_peek_node(iter);
1970                 if (!b || b->data->keys.seq != seq)
1971                         break;
1972
1973                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1974                 if (ret != -EAGAIN &&
1975                     ret != -EINTR)
1976                         break;
1977
1978                 bch2_trans_unlock(trans);
1979                 closure_sync(&cl);
1980         }
1981
1982         bch2_btree_iter_downgrade(iter);
1983
1984         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1985                 up_read(&c->gc_lock);
1986
1987         closure_sync(&cl);
1988         return ret;
1989 }
1990
1991 static void __bch2_btree_node_update_key(struct bch_fs *c,
1992                                          struct btree_update *as,
1993                                          struct btree_iter *iter,
1994                                          struct btree *b, struct btree *new_hash,
1995                                          struct bkey_i *new_key)
1996 {
1997         struct btree *parent;
1998         int ret;
1999
2000         /*
2001          * Two corner cases that need to be thought about here:
2002          *
2003          * @b may not be reachable yet - there might be another interior update
2004          * operation waiting on @b to be written, and we're going to deliver the
2005          * write completion to that interior update operation _before_
2006          * persisting the new_key update
2007          *
2008          * That ends up working without us having to do anything special here:
2009          * the reason is, we do kick off (and do the in memory updates) for the
2010          * update for @new_key before we return, creating a new interior_update
2011          * operation here.
2012          *
2013          * The new interior update operation here will in effect override the
2014          * previous one. The previous one was going to terminate - make @b
2015          * reachable - in one of two ways:
2016          * - updating the btree root pointer
2017          *   In that case,
2018          *   no, this doesn't work. argh.
2019          */
2020
2021         if (b->will_make_reachable)
2022                 as->must_rewrite = true;
2023
2024         btree_interior_update_add_node_reference(as, b);
2025
2026         /*
2027          * XXX: the rest of the update path treats this like we're actually
2028          * inserting a new node and deleting the existing node, so the
2029          * reservation needs to include enough space for @b
2030          *
2031          * that is actually rather sketchy, though, and it's surprising that
2032          * the code works this way - it should eventually be reworked into
2033          * something saner.
2034          *
2035          * (I think @b is just getting double counted until the btree update
2036          * finishes and "deletes" @b on disk)
2037          */
2038         ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
2039                         c->opts.btree_node_size *
2040                         bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
2041                         BCH_DISK_RESERVATION_NOFAIL);
2042         BUG_ON(ret);
2043
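        /*
         * If @b has a parent, the new key goes through the normal interior
         * node insert path; if @b is the root, accounting and the in-memory
         * key are updated directly under the node's write lock:
         */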
2044         parent = btree_node_parent(iter, b);
2045         if (parent) {
2046                 if (new_hash) {
2047                         bkey_copy(&new_hash->key, new_key);
2048                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
2049                                         new_hash, b->level, b->btree_id);
2050                         BUG_ON(ret);
2051                 }
2052
2053                 bch2_keylist_add(&as->parent_keys, new_key);
2054                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
2055
2056                 if (new_hash) {
2057                         mutex_lock(&c->btree_cache.lock);
2058                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
2059
2060                         bch2_btree_node_hash_remove(&c->btree_cache, b);
2061
2062                         bkey_copy(&b->key, new_key);
2063                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
2064                         BUG_ON(ret);
2065                         mutex_unlock(&c->btree_cache.lock);
2066                 } else {
2067                         bkey_copy(&b->key, new_key);
2068                 }
2069         } else {
2070                 struct bch_fs_usage *fs_usage;
2071
2072                 BUG_ON(btree_node_root(c, b) != b);
2073
2074                 bch2_btree_node_lock_write(b, iter);
2075
2076                 mutex_lock(&c->btree_interior_update_lock);
2077                 percpu_down_read(&c->mark_lock);
2078                 fs_usage = bch2_fs_usage_scratch_get(c);
2079
2080                 bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
2081                               0, 0, fs_usage, 0,
2082                               BTREE_TRIGGER_INSERT);
2083                 if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
2084                         bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
2085                                              0, 0, NULL, 0,
2086                                              BTREE_TRIGGER_INSERT|
2087                                              BTREE_TRIGGER_GC);
2088
2089                 bch2_btree_node_free_index(as, NULL,
2090                                            bkey_i_to_s_c(&b->key),
2091                                            fs_usage);
2092                 bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
2093
2094                 bch2_fs_usage_scratch_put(c, fs_usage);
2095                 percpu_up_read(&c->mark_lock);
2096                 mutex_unlock(&c->btree_interior_update_lock);
2097
2098                 if (btree_ptr_hash_val(new_key) != b->hash_val) {
2099                         mutex_lock(&c->btree_cache.lock);
2100                         bch2_btree_node_hash_remove(&c->btree_cache, b);
2101
2102                         bkey_copy(&b->key, new_key);
2103                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
2104                         BUG_ON(ret);
2105                         mutex_unlock(&c->btree_cache.lock);
2106                 } else {
2107                         bkey_copy(&b->key, new_key);
2108                 }
2109
2110                 btree_update_updated_root(as);
2111                 bch2_btree_node_unlock_write(b, iter);
2112         }
2113
2114         bch2_btree_update_done(as);
2115 }
2116
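/*
 * Change the key (i.e. the pointer) for a btree node. If the new key hashes
 * differently from the old one, a placeholder node is allocated up front so
 * it can temporarily occupy the new hash table slot while the parent is
 * being updated, before @b itself is re-hashed under the new key.
 */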
2117 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
2118                                struct btree *b,
2119                                struct bkey_i *new_key)
2120 {
2121         struct btree *parent = btree_node_parent(iter, b);
2122         struct btree_update *as = NULL;
2123         struct btree *new_hash = NULL;
2124         struct closure cl;
2125         int ret;
2126
2127         closure_init_stack(&cl);
2128
2129         if (!bch2_btree_iter_upgrade(iter, U8_MAX))
2130                 return -EINTR;
2131
2132         if (!down_read_trylock(&c->gc_lock)) {
2133                 bch2_trans_unlock(iter->trans);
2134                 down_read(&c->gc_lock);
2135
2136                 if (!bch2_trans_relock(iter->trans)) {
2137                         ret = -EINTR;
2138                         goto err;
2139                 }
2140         }
2141
2142         /*
2143          * check btree_ptr_hash_val() after @b is locked by
2144          * btree_iter_traverse():
2145          */
2146         if (btree_ptr_hash_val(new_key) != b->hash_val) {
2147                 /* bch2_btree_reserve_get will unlock */
2148                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2149                 if (ret) {
2150                         bch2_trans_unlock(iter->trans);
2151                         up_read(&c->gc_lock);
2152                         closure_sync(&cl);
2153                         down_read(&c->gc_lock);
2154
2155                         if (!bch2_trans_relock(iter->trans)) {
2156                                 ret = -EINTR;
2157                                 goto err;
2158                         }
2159                 }
2160
2161                 new_hash = bch2_btree_node_mem_alloc(c);
2162         }
2163
2164         as = bch2_btree_update_start(c, iter->btree_id,
2165                 parent ? btree_update_reserve_required(c, parent) : 0,
2166                 BTREE_INSERT_NOFAIL|
2167                 BTREE_INSERT_USE_RESERVE|
2168                 BTREE_INSERT_USE_ALLOC_RESERVE,
2169                 &cl);
2170
2171         if (IS_ERR(as)) {
2172                 ret = PTR_ERR(as);
2173                 if (ret == -EAGAIN)
2174                         ret = -EINTR;
2175
2176                 if (ret != -EINTR)
2177                         goto err;
2178
2179                 bch2_trans_unlock(iter->trans);
2180                 up_read(&c->gc_lock);
2181                 closure_sync(&cl);
2182                 down_read(&c->gc_lock);
2183
2184                 if (!bch2_trans_relock(iter->trans))
2185                         goto err;
2186         }
2187
2188         ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
2189         if (ret)
2190                 goto err_free_update;
2191
2192         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
2193
2194         bch2_btree_iter_downgrade(iter);
2195 err:
2196         if (new_hash) {
2197                 mutex_lock(&c->btree_cache.lock);
2198                 list_move(&new_hash->list, &c->btree_cache.freeable);
2199                 mutex_unlock(&c->btree_cache.lock);
2200
2201                 six_unlock_write(&new_hash->lock);
2202                 six_unlock_intent(&new_hash->lock);
2203         }
2204         up_read(&c->gc_lock);
2205         closure_sync(&cl);
2206         return ret;
2207 err_free_update:
2208         bch2_btree_update_free(as);
2209         goto err;
2210 }
2211
2212 /* Init code: */
2213
2214 /*
2215  * Only for filesystem bringup, when first reading the btree roots or allocating
2216  * btree roots when initializing a new filesystem:
2217  */
2218 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2219 {
2220         BUG_ON(btree_node_root(c, b));
2221
2222         __bch2_btree_set_root_inmem(c, b);
2223 }
2224
2225 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
2226 {
2227         struct closure cl;
2228         struct btree *b;
2229         int ret;
2230
2231         closure_init_stack(&cl);
2232
2233         do {
2234                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2235                 closure_sync(&cl);
2236         } while (ret);
2237
2238         b = bch2_btree_node_mem_alloc(c);
2239         bch2_btree_cache_cannibalize_unlock(c);
2240
2241         set_btree_node_fake(b);
2242         b->level        = 0;
2243         b->btree_id     = id;
2244
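        /*
         * The fake root is never written to disk, but it still needs a unique
         * (dummy) pointer so it can be hashed in the btree node cache:
         */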
2245         bkey_btree_ptr_init(&b->key);
2246         b->key.k.p = POS_MAX;
2247         *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
2248
2249         bch2_bset_init_first(b, &b->data->keys);
2250         bch2_btree_build_aux_trees(b);
2251
2252         b->data->flags = 0;
2253         btree_set_min(b, POS_MIN);
2254         btree_set_max(b, POS_MAX);
2255         b->data->format = bch2_btree_calc_format(b);
2256         btree_node_set_format(b, b->data->format);
2257
2258         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
2259         BUG_ON(ret);
2260
2261         __bch2_btree_set_root_inmem(c, b);
2262
2263         six_unlock_write(&b->lock);
2264         six_unlock_intent(&b->lock);
2265 }
2266
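/*
 * Debugging aid: print each in-flight interior update's mode, nodes-written
 * count, closure refcount, and journal sequence number.
 */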
2267 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
2268 {
2269         struct printbuf out = _PBUF(buf, PAGE_SIZE);
2270         struct btree_update *as;
2271
2272         mutex_lock(&c->btree_interior_update_lock);
2273         list_for_each_entry(as, &c->btree_interior_update_list, list)
2274                 pr_buf(&out, "%p m %u w %u r %u j %llu\n",
2275                        as,
2276                        as->mode,
2277                        as->nodes_written,
2278                        atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
2279                        as->journal.seq);
2280         mutex_unlock(&c->btree_interior_update_lock);
2281
2282         return out.pos - buf;
2283 }
2284
2285 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
2286 {
2287         size_t ret = 0;
2288         struct list_head *i;
2289
2290         mutex_lock(&c->btree_interior_update_lock);
2291         list_for_each(i, &c->btree_interior_update_list)
2292                 ret++;
2293         mutex_unlock(&c->btree_interior_update_lock);
2294
2295         return ret;
2296 }