libbcachefs/btree_update_interior.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "alloc_foreground.h"
5 #include "bkey_methods.h"
6 #include "btree_cache.h"
7 #include "btree_gc.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "btree_io.h"
11 #include "btree_iter.h"
12 #include "btree_locking.h"
13 #include "buckets.h"
14 #include "extents.h"
15 #include "journal.h"
16 #include "journal_reclaim.h"
17 #include "keylist.h"
18 #include "replicas.h"
19 #include "super-io.h"
20
21 #include <linux/random.h>
22 #include <trace/events/bcachefs.h>
23
24 static void btree_node_will_make_reachable(struct btree_update *,
25                                            struct btree *);
26 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
27
28 /* Debug code: */
29
30 /*
31  * Verify that child nodes correctly span parent node's range:
32  */
33 static void btree_node_interior_verify(struct btree *b)
34 {
35 #ifdef CONFIG_BCACHEFS_DEBUG
36         struct bpos next_node = b->data->min_key;
37         struct btree_node_iter iter;
38         struct bkey_s_c k;
39         struct bkey_s_c_btree_ptr_v2 bp;
40         struct bkey unpacked;
41
42         BUG_ON(!b->level);
43
44         bch2_btree_node_iter_init_from_start(&iter, b);
45
46         while (1) {
47                 k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
48                 bp = bkey_s_c_to_btree_ptr_v2(k);
49
50                 BUG_ON(bkey_cmp(next_node, bp.v->min_key));
51
52                 bch2_btree_node_iter_advance(&iter, b);
53
54                 if (bch2_btree_node_iter_end(&iter)) {
55                         BUG_ON(bkey_cmp(k.k->p, b->key.k.p));
56                         break;
57                 }
58
59                 next_node = bkey_successor(k.k->p);
60         }
61 #endif
62 }
63
64 /* Calculate ideal packed bkey format for new btree nodes: */
65
66 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
67 {
68         struct bkey_packed *k;
69         struct bset_tree *t;
70         struct bkey uk;
71
72         bch2_bkey_format_add_pos(s, b->data->min_key);
73
74         for_each_bset(b, t)
75                 bset_tree_for_each_key(b, t, k)
76                         if (!bkey_whiteout(k)) {
77                                 uk = bkey_unpack_key(b, k);
78                                 bch2_bkey_format_add_key(s, &uk);
79                         }
80 }
81
82 static struct bkey_format bch2_btree_calc_format(struct btree *b)
83 {
84         struct bkey_format_state s;
85
86         bch2_bkey_format_init(&s);
87         __bch2_btree_calc_format(&s, b);
88
89         return bch2_bkey_format_done(&s);
90 }
91
92 static size_t btree_node_u64s_with_format(struct btree *b,
93                                           struct bkey_format *new_f)
94 {
95         struct bkey_format *old_f = &b->format;
96
97         /* stupid integer promotion rules */
98         ssize_t delta =
99             (((int) new_f->key_u64s - old_f->key_u64s) *
100              (int) b->nr.packed_keys) +
101             (((int) new_f->key_u64s - BKEY_U64s) *
102              (int) b->nr.unpacked_keys);
103
104         BUG_ON(delta + b->nr.live_u64s < 0);
105
106         return b->nr.live_u64s + delta;
107 }
108
109 /**
110  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
111  *
112  * This assumes all keys can pack with the new format -- it just checks if
113  * the re-packed keys would fit inside the node itself.
114  */
115 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
116                                  struct bkey_format *new_f)
117 {
118         size_t u64s = btree_node_u64s_with_format(b, new_f);
119
120         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
121 }
122
123 /* Btree node freeing/allocation: */
124
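/*
 * Returns true if @l and @r reference at least one bucket in common (same
 * device, generation and offset) - used to match a key against an entry on
 * the pending free list:
 */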
125 static bool btree_key_matches(struct bch_fs *c,
126                               struct bkey_s_c l,
127                               struct bkey_s_c r)
128 {
129         struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(l);
130         struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(r);
131         const struct bch_extent_ptr *ptr1, *ptr2;
132
133         bkey_for_each_ptr(ptrs1, ptr1)
134                 bkey_for_each_ptr(ptrs2, ptr2)
135                         if (ptr1->dev == ptr2->dev &&
136                             ptr1->gen == ptr2->gen &&
137                             ptr1->offset == ptr2->offset)
138                                 return true;
139
140         return false;
141 }
142
143 /*
144  * We're doing the index update that makes @b unreachable; update accounting
145  * to reflect that.
146  *
147  * Must be called _before_ btree_update_updated_root() or
148  * btree_update_updated_node():
149  */
150 static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
151                                        struct bkey_s_c k,
152                                        struct bch_fs_usage *stats)
153 {
154         struct bch_fs *c = as->c;
155         struct pending_btree_node_free *d;
156
157         for (d = as->pending; d < as->pending + as->nr_pending; d++)
158                 if (!bkey_cmp(k.k->p, d->key.k.p) &&
159                     btree_key_matches(c, k, bkey_i_to_s_c(&d->key)))
160                         goto found;
161         BUG();
162 found:
163         BUG_ON(d->index_update_done);
164         d->index_update_done = true;
165
166         /*
167          * We're dropping @k from the btree, but it's still live until the
168          * index update is persistent so we need to keep a reference around for
169          * mark and sweep to find - that's primarily what the
170          * btree_node_pending_free list is for.
171          *
172          * So here (when we set index_update_done = true), we're moving an
173          * existing reference to a different part of the larger "gc keyspace" -
174          * and the new position comes after the old position, since GC marks
175          * the pending free list after it walks the btree.
176          *
177          * If we move the reference while mark and sweep is _between_ the old
178          * and the new position, mark and sweep will see the reference twice
179          * and it'll get double accounted - so check for that here and subtract
180          * to cancel out one of mark and sweep's markings if necessary:
181          */
182
183         if (gc_pos_cmp(c->gc_pos, b
184                        ? gc_pos_btree_node(b)
185                        : gc_pos_btree_root(as->btree_id)) >= 0 &&
186             gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
187                 bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
188                               0, 0, NULL, 0,
189                               BTREE_TRIGGER_OVERWRITE|
190                               BTREE_TRIGGER_GC);
191 }
192
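/*
 * Low level btree node free: caller must hold the node's write lock, and the
 * node must be clean (not dirty, no pending write), must not be the root, and
 * must not be on any update's new-node list:
 */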
193 static void __btree_node_free(struct bch_fs *c, struct btree *b)
194 {
195         trace_btree_node_free(c, b);
196
197         BUG_ON(btree_node_dirty(b));
198         BUG_ON(btree_node_need_write(b));
199         BUG_ON(b == btree_node_root(c, b));
200         BUG_ON(b->ob.nr);
201         BUG_ON(!list_empty(&b->write_blocked));
202         BUG_ON(b->will_make_reachable);
203
204         clear_btree_node_noevict(b);
205
206         bch2_btree_node_hash_remove(&c->btree_cache, b);
207
208         mutex_lock(&c->btree_cache.lock);
209         list_move(&b->list, &c->btree_cache.freeable);
210         mutex_unlock(&c->btree_cache.lock);
211 }
212
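/*
 * Free a node that was allocated but never had its pointer inserted into the
 * btree: drop it from its btree_update and release its open buckets:
 */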
213 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
214 {
215         struct open_buckets ob = b->ob;
216
217         btree_update_drop_new_node(c, b);
218
219         b->ob.nr = 0;
220
221         clear_btree_node_dirty(b);
222
223         btree_node_lock_type(c, b, SIX_LOCK_write);
224         __btree_node_free(c, b);
225         six_unlock_write(&b->lock);
226
227         bch2_open_buckets_put(c, &ob);
228 }
229
230 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
231                                 struct btree_iter *iter)
232 {
233         struct btree_iter *linked;
234
235         trans_for_each_iter(iter->trans, linked)
236                 BUG_ON(linked->l[b->level].b == b);
237
238         /*
239          * Is this a node that isn't reachable on disk yet?
240          *
241          * Nodes that aren't reachable yet have writes blocked until they're
242          * reachable - now that we've cancelled any pending writes and moved
243          * things waiting on that write to wait on this update, we can drop this
244          * node from the list of nodes that the other update is making
245          * reachable, prior to freeing it:
246          */
247         btree_update_drop_new_node(c, b);
248
249         six_lock_write(&b->lock);
250         __btree_node_free(c, b);
251         six_unlock_write(&b->lock);
252         six_unlock_intent(&b->lock);
253 }
254
255 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
256                         struct pending_btree_node_free *pending,
257                         u64 journal_seq)
258 {
259         BUG_ON(!pending->index_update_done);
260
261         bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
262                       0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
263
264         if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
265                 bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
266                               0, 0, NULL, journal_seq,
267                               BTREE_TRIGGER_OVERWRITE|
268                               BTREE_TRIGGER_GC);
269 }
270
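/*
 * Allocate space on disk for a new btree node: check the btree reserve cache
 * first, otherwise allocate from the btree write point, then pair the
 * allocation with an in-memory btree node:
 */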
271 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
272                                              struct disk_reservation *res,
273                                              struct closure *cl,
274                                              unsigned flags)
275 {
276         struct write_point *wp;
277         struct btree *b;
278         BKEY_PADDED(k) tmp;
279         struct open_buckets ob = { .nr = 0 };
280         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
281         unsigned nr_reserve;
282         enum alloc_reserve alloc_reserve;
283
284         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
285                 nr_reserve      = 0;
286                 alloc_reserve   = RESERVE_ALLOC;
287         } else if (flags & BTREE_INSERT_USE_RESERVE) {
288                 nr_reserve      = BTREE_NODE_RESERVE / 2;
289                 alloc_reserve   = RESERVE_BTREE;
290         } else {
291                 nr_reserve      = BTREE_NODE_RESERVE;
292                 alloc_reserve   = RESERVE_NONE;
293         }
294
295         mutex_lock(&c->btree_reserve_cache_lock);
296         if (c->btree_reserve_cache_nr > nr_reserve) {
297                 struct btree_alloc *a =
298                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
299
300                 ob = a->ob;
301                 bkey_copy(&tmp.k, &a->k);
302                 mutex_unlock(&c->btree_reserve_cache_lock);
303                 goto mem_alloc;
304         }
305         mutex_unlock(&c->btree_reserve_cache_lock);
306
307 retry:
308         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
309                                       writepoint_ptr(&c->btree_write_point),
310                                       &devs_have,
311                                       res->nr_replicas,
312                                       c->opts.metadata_replicas_required,
313                                       alloc_reserve, 0, cl);
314         if (IS_ERR(wp))
315                 return ERR_CAST(wp);
316
317         if (wp->sectors_free < c->opts.btree_node_size) {
318                 struct open_bucket *ob;
319                 unsigned i;
320
321                 open_bucket_for_each(c, &wp->ptrs, ob, i)
322                         if (ob->sectors_free < c->opts.btree_node_size)
323                                 ob->sectors_free = 0;
324
325                 bch2_alloc_sectors_done(c, wp);
326                 goto retry;
327         }
328
329         if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
330                 bkey_btree_ptr_v2_init(&tmp.k);
331         else
332                 bkey_btree_ptr_init(&tmp.k);
333
334         bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
335
336         bch2_open_bucket_get(c, wp, &ob);
337         bch2_alloc_sectors_done(c, wp);
338 mem_alloc:
339         b = bch2_btree_node_mem_alloc(c);
340
341         /* we hold cannibalize_lock: */
342         BUG_ON(IS_ERR(b));
343         BUG_ON(b->ob.nr);
344
345         bkey_copy(&b->key, &tmp.k);
346         b->ob = ob;
347
348         return b;
349 }
350
351 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
352 {
353         struct bch_fs *c = as->c;
354         struct btree *b;
355         int ret;
356
357         BUG_ON(level >= BTREE_MAX_DEPTH);
358         BUG_ON(!as->reserve->nr);
359
360         b = as->reserve->b[--as->reserve->nr];
361
362         set_btree_node_accessed(b);
363         set_btree_node_dirty(b);
364         set_btree_node_need_write(b);
365
366         bch2_bset_init_first(b, &b->data->keys);
367         b->level        = level;
368         b->btree_id     = as->btree_id;
369
370         memset(&b->nr, 0, sizeof(b->nr));
371         b->data->magic = cpu_to_le64(bset_magic(c));
372         b->data->flags = 0;
373         SET_BTREE_NODE_ID(b->data, as->btree_id);
374         SET_BTREE_NODE_LEVEL(b->data, level);
375         b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
376
377         if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
378                 struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
379
380                 bp->v.mem_ptr           = 0;
381                 bp->v.seq               = b->data->keys.seq;
382                 bp->v.sectors_written   = 0;
383                 bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
384         }
385
386         if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
387                 SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
388
389         if (btree_node_is_extents(b) &&
390             !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
391                 set_btree_node_old_extent_overwrite(b);
392
393         bch2_btree_build_aux_trees(b);
394
395         btree_node_will_make_reachable(as, b);
396
397         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
398         BUG_ON(ret);
399
400         trace_btree_node_alloc(c, b);
401         return b;
402 }
403
404 static void btree_set_min(struct btree *b, struct bpos pos)
405 {
406         if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
407                 bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
408         b->data->min_key = pos;
409 }
410
411 static void btree_set_max(struct btree *b, struct bpos pos)
412 {
413         b->key.k.p = pos;
414         b->data->max_key = pos;
415 }
416
417 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
418                                                   struct btree *b,
419                                                   struct bkey_format format)
420 {
421         struct btree *n;
422
423         n = bch2_btree_node_alloc(as, b->level);
424
425         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
426
427         btree_set_min(n, b->data->min_key);
428         btree_set_max(n, b->data->max_key);
429
430         n->data->format         = format;
431         btree_node_set_format(n, format);
432
433         bch2_btree_sort_into(as->c, n, b);
434
435         btree_node_reset_sib_u64s(n);
436
437         n->key.k.p = b->key.k.p;
438         return n;
439 }
440
441 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
442                                                        struct btree *b)
443 {
444         struct bkey_format new_f = bch2_btree_calc_format(b);
445
446         /*
447          * The keys might expand with the new format - if they wouldn't fit in
448          * the btree node anymore, use the old format for now:
449          */
450         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
451                 new_f = b->format;
452
453         return __bch2_btree_node_alloc_replacement(as, b, new_f);
454 }
455
456 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
457 {
458         struct btree *b = bch2_btree_node_alloc(as, level);
459
460         btree_set_min(b, POS_MIN);
461         btree_set_max(b, POS_MAX);
462         b->data->format = bch2_btree_calc_format(b);
463
464         btree_node_set_format(b, b->data->format);
465         bch2_btree_build_aux_trees(b);
466
467         six_unlock_write(&b->lock);
468
469         return b;
470 }
471
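/*
 * Release an unused (or partially used) reserve: stash the remaining
 * preallocated nodes in the btree reserve cache where possible, free them,
 * and drop the disk reservation:
 */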
472 static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
473 {
474         bch2_disk_reservation_put(c, &reserve->disk_res);
475
476         mutex_lock(&c->btree_reserve_cache_lock);
477
478         while (reserve->nr) {
479                 struct btree *b = reserve->b[--reserve->nr];
480
481                 six_unlock_write(&b->lock);
482
483                 if (c->btree_reserve_cache_nr <
484                     ARRAY_SIZE(c->btree_reserve_cache)) {
485                         struct btree_alloc *a =
486                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
487
488                         a->ob = b->ob;
489                         b->ob.nr = 0;
490                         bkey_copy(&a->k, &b->key);
491                 } else {
492                         bch2_open_buckets_put(c, &b->ob);
493                 }
494
495                 btree_node_lock_type(c, b, SIX_LOCK_write);
496                 __btree_node_free(c, b);
497                 six_unlock_write(&b->lock);
498
499                 six_unlock_intent(&b->lock);
500         }
501
502         mutex_unlock(&c->btree_reserve_cache_lock);
503
504         mempool_free(reserve, &c->btree_reserve_pool);
505 }
506
507 static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
508                                                     unsigned nr_nodes,
509                                                     unsigned flags,
510                                                     struct closure *cl)
511 {
512         struct btree_reserve *reserve;
513         struct btree *b;
514         struct disk_reservation disk_res = { 0, 0 };
515         unsigned sectors = nr_nodes * c->opts.btree_node_size;
516         int ret, disk_res_flags = 0;
517
518         if (flags & BTREE_INSERT_NOFAIL)
519                 disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
520
521         /*
522          * This check isn't necessary for correctness - it's just to potentially
523          * prevent us from doing a lot of work that'll end up being wasted:
524          */
525         ret = bch2_journal_error(&c->journal);
526         if (ret)
527                 return ERR_PTR(ret);
528
529         if (bch2_disk_reservation_get(c, &disk_res, sectors,
530                                       c->opts.metadata_replicas,
531                                       disk_res_flags))
532                 return ERR_PTR(-ENOSPC);
533
534         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
535
536         /*
537          * Protects reaping from the btree node cache and using the btree node
538          * open bucket reserve:
539          */
540         ret = bch2_btree_cache_cannibalize_lock(c, cl);
541         if (ret) {
542                 bch2_disk_reservation_put(c, &disk_res);
543                 return ERR_PTR(ret);
544         }
545
546         reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
547
548         reserve->disk_res = disk_res;
549         reserve->nr = 0;
550
551         while (reserve->nr < nr_nodes) {
552                 b = __bch2_btree_node_alloc(c, &disk_res,
553                                             flags & BTREE_INSERT_NOWAIT
554                                             ? NULL : cl, flags);
555                 if (IS_ERR(b)) {
556                         ret = PTR_ERR(b);
557                         goto err_free;
558                 }
559
560                 ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key));
561                 if (ret)
562                         goto err_free;
563
564                 reserve->b[reserve->nr++] = b;
565         }
566
567         bch2_btree_cache_cannibalize_unlock(c);
568         return reserve;
569 err_free:
570         bch2_btree_reserve_put(c, reserve);
571         bch2_btree_cache_cannibalize_unlock(c);
572         trace_btree_reserve_get_fail(c, nr_nodes, cl);
573         return ERR_PTR(ret);
574 }
575
576 /* Asynchronous interior node update machinery */
577
578 static void __bch2_btree_update_free(struct btree_update *as)
579 {
580         struct bch_fs *c = as->c;
581
582         bch2_journal_preres_put(&c->journal, &as->journal_preres);
583
584         bch2_journal_pin_drop(&c->journal, &as->journal);
585         bch2_journal_pin_flush(&c->journal, &as->journal);
586
587         BUG_ON((as->nr_new_nodes || as->nr_pending) &&
588                !bch2_journal_error(&c->journal));
589
590         if (as->reserve)
591                 bch2_btree_reserve_put(c, as->reserve);
592
593         list_del(&as->list);
594
595         closure_debug_destroy(&as->cl);
596         mempool_free(as, &c->btree_interior_update_pool);
597
598         closure_wake_up(&c->btree_interior_update_wait);
599 }
600
601 static void bch2_btree_update_free(struct btree_update *as)
602 {
603         struct bch_fs *c = as->c;
604
605         mutex_lock(&c->btree_interior_update_lock);
606         __bch2_btree_update_free(as);
607         mutex_unlock(&c->btree_interior_update_lock);
608 }
609
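/*
 * Called once the index update making the new nodes reachable has been
 * journalled: the new nodes no longer need their writes blocked, and the
 * nodes they replaced can now be freed on disk:
 */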
610 static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
611 {
612         struct bch_fs *c = as->c;
613
614         while (as->nr_new_nodes) {
615                 struct btree *b = as->new_nodes[--as->nr_new_nodes];
616
617                 BUG_ON(b->will_make_reachable != (unsigned long) as);
618                 b->will_make_reachable = 0;
619
620                 /*
621                  * b->will_make_reachable prevented it from being written, so
622                  * write it now if it needs to be written:
623                  */
624                 btree_node_lock_type(c, b, SIX_LOCK_read);
625                 bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
626                 six_unlock_read(&b->lock);
627         }
628
629         while (as->nr_pending)
630                 bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
631                                             seq);
632 }
633
634 static void btree_update_nodes_written(struct closure *cl)
635 {
636         struct btree_update *as = container_of(cl, struct btree_update, cl);
637         struct journal_res res = { 0 };
638         struct bch_fs *c = as->c;
639         struct btree *b;
640         struct bset *i;
641         int ret;
642
643         /*
644          * We did an update to a parent node where the pointers we added pointed
645          * to child nodes that weren't written yet: now, the child nodes have
646          * been written so we can write out the update to the interior node.
647          */
648         mutex_lock(&c->btree_interior_update_lock);
649         as->nodes_written = true;
650 again:
651         as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
652                                       struct btree_update, unwritten_list);
653         if (!as || !as->nodes_written) {
654                 mutex_unlock(&c->btree_interior_update_lock);
655                 return;
656         }
657
658         b = as->b;
659         if (b && !six_trylock_intent(&b->lock)) {
660                 mutex_unlock(&c->btree_interior_update_lock);
661                 btree_node_lock_type(c, b, SIX_LOCK_intent);
662                 six_unlock_intent(&b->lock);
663                 mutex_lock(&c->btree_interior_update_lock);
664                 goto again;
665         }
666
667         list_del(&as->unwritten_list);
668
669         ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
670                                    JOURNAL_RES_GET_RESERVED);
671         if (ret) {
672                 BUG_ON(!bch2_journal_error(&c->journal));
673                 /* can't unblock btree writes */
674                 goto free_update;
675         }
676
677         {
678                 struct journal_buf *buf = &c->journal.buf[res.idx];
679                 struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
680
681                 res.offset      += as->journal_u64s;
682                 res.u64s        -= as->journal_u64s;
683                 memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
684         }
685
686         switch (as->mode) {
687         case BTREE_INTERIOR_NO_UPDATE:
688                 BUG();
689         case BTREE_INTERIOR_UPDATING_NODE:
690                 /* @b is the node we did the final insert into: */
691                 BUG_ON(!res.ref);
692
693                 six_lock_write(&b->lock);
694                 list_del(&as->write_blocked_list);
695
696                 i = btree_bset_last(b);
697                 i->journal_seq = cpu_to_le64(
698                         max(res.seq,
699                             le64_to_cpu(i->journal_seq)));
700
701                 bch2_btree_add_journal_pin(c, b, res.seq);
702                 six_unlock_write(&b->lock);
703                 break;
704
705         case BTREE_INTERIOR_UPDATING_AS:
706                 BUG_ON(b);
707                 break;
708
709         case BTREE_INTERIOR_UPDATING_ROOT: {
710                 struct btree_root *r = &c->btree_roots[as->btree_id];
711
712                 BUG_ON(b);
713
714                 mutex_lock(&c->btree_root_lock);
715                 bkey_copy(&r->key, as->parent_keys.keys);
716                 r->level = as->level;
717                 r->alive = true;
718                 c->btree_roots_dirty = true;
719                 mutex_unlock(&c->btree_root_lock);
720                 break;
721         }
722         }
723
724         bch2_journal_pin_drop(&c->journal, &as->journal);
725
726         bch2_journal_res_put(&c->journal, &res);
727         bch2_journal_preres_put(&c->journal, &as->journal_preres);
728
729         /* Do btree write after dropping journal res: */
730         if (b) {
731                 /*
732                  * b->write_blocked prevented it from being written, so
733                  * write it now if it needs to be written:
734                  */
735                 btree_node_write_if_need(c, b, SIX_LOCK_intent);
736                 six_unlock_intent(&b->lock);
737         }
738
739         btree_update_nodes_reachable(as, res.seq);
740 free_update:
741         __bch2_btree_update_free(as);
742         /*
743          * for flush_held_btree_writes() waiting on updates to flush or
744          * nodes to be writeable:
745          */
746         closure_wake_up(&c->btree_interior_update_wait);
747         goto again;
748 }
749
750 /*
751  * We're updating @b with pointers to nodes that haven't finished writing yet:
752  * block @b from being written until @as completes
753  */
754 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
755 {
756         struct bch_fs *c = as->c;
757
758         mutex_lock(&c->btree_interior_update_lock);
759         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
760
761         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
762         BUG_ON(!btree_node_dirty(b));
763
764         as->mode        = BTREE_INTERIOR_UPDATING_NODE;
765         as->b           = b;
766         as->level       = b->level;
767         list_add(&as->write_blocked_list, &b->write_blocked);
768
769         mutex_unlock(&c->btree_interior_update_lock);
770 }
771
772 static void btree_update_reparent(struct btree_update *as,
773                                   struct btree_update *child)
774 {
775         struct bch_fs *c = as->c;
776
777         lockdep_assert_held(&c->btree_interior_update_lock);
778
779         child->b = NULL;
780         child->mode = BTREE_INTERIOR_UPDATING_AS;
781
782         /*
783          * When we write a new btree root, we have to drop our journal pin
784          * _before_ the new nodes are technically reachable; see
785          * btree_update_nodes_written().
786          *
787          * This goes for journal pins that are recursively blocked on us - so,
788          * just transfer the journal pin to the new interior update so
789          * btree_update_nodes_written() can drop it.
790          */
791         bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
792         bch2_journal_pin_drop(&c->journal, &child->journal);
793 }
794
795 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
796 {
797         struct bch_fs *c = as->c;
798
799         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
800         BUG_ON(!bch2_keylist_empty(&as->parent_keys));
801
802         mutex_lock(&c->btree_interior_update_lock);
803         list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
804
805         as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
806         as->level       = b->level;
807         bch2_keylist_add(&as->parent_keys, &b->key);
808         mutex_unlock(&c->btree_interior_update_lock);
809 }
810
811 static void btree_node_will_make_reachable(struct btree_update *as,
812                                            struct btree *b)
813 {
814         struct bch_fs *c = as->c;
815
816         mutex_lock(&c->btree_interior_update_lock);
817         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
818         BUG_ON(b->will_make_reachable);
819
820         as->new_nodes[as->nr_new_nodes++] = b;
821         b->will_make_reachable = 1UL|(unsigned long) as;
822
823         closure_get(&as->cl);
824         mutex_unlock(&c->btree_interior_update_lock);
825 }
826
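/*
 * Remove @b from the new-node list of the btree_update that was going to make
 * it reachable - used when the node is freed before it was ever written:
 */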
827 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
828 {
829         struct btree_update *as;
830         unsigned long v;
831         unsigned i;
832
833         mutex_lock(&c->btree_interior_update_lock);
834         v = xchg(&b->will_make_reachable, 0);
835         as = (struct btree_update *) (v & ~1UL);
836
837         if (!as) {
838                 mutex_unlock(&c->btree_interior_update_lock);
839                 return;
840         }
841
842         for (i = 0; i < as->nr_new_nodes; i++)
843                 if (as->new_nodes[i] == b)
844                         goto found;
845
846         BUG();
847 found:
848         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
849         mutex_unlock(&c->btree_interior_update_lock);
850
851         if (v & 1)
852                 closure_put(&as->cl);
853 }
854
855 static void btree_interior_update_add_node_reference(struct btree_update *as,
856                                                      struct btree *b)
857 {
858         struct bch_fs *c = as->c;
859         struct pending_btree_node_free *d;
860
861         mutex_lock(&c->btree_interior_update_lock);
862
863         /* Add this node to the list of nodes being freed: */
864         BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
865
866         d = &as->pending[as->nr_pending++];
867         d->index_update_done    = false;
868         d->seq                  = b->data->keys.seq;
869         d->btree_id             = b->btree_id;
870         d->level                = b->level;
871         bkey_copy(&d->key, &b->key);
872
873         mutex_unlock(&c->btree_interior_update_lock);
874 }
875
876 /*
877  * @b is being split/rewritten: it may have pointers to not-yet-written btree
878  * nodes and thus outstanding btree_updates - redirect @b's
879  * btree_updates to point to this btree_update:
880  */
881 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
882                                                struct btree *b)
883 {
884         struct bch_fs *c = as->c;
885         struct btree_update *p, *n;
886         struct btree_write *w;
887
888         set_btree_node_dying(b);
889
890         if (btree_node_fake(b))
891                 return;
892
893         btree_interior_update_add_node_reference(as, b);
894
895         mutex_lock(&c->btree_interior_update_lock);
896
897         /*
898          * Does this node have any btree_update operations preventing
899          * it from being written?
900          *
901          * If so, redirect them to point to this btree_update: we can
902          * write out our new nodes, but we won't make them visible until those
903          * operations complete
904          */
905         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
906                 list_del(&p->write_blocked_list);
907                 btree_update_reparent(as, p);
908
909                 /*
910                  * for flush_held_btree_writes() waiting on updates to flush or
911                  * nodes to be writeable:
912                  */
913                 closure_wake_up(&c->btree_interior_update_wait);
914         }
915
916         clear_btree_node_dirty(b);
917         clear_btree_node_need_write(b);
918
919         /*
920          * Does this node have unwritten data that has a pin on the journal?
921          *
922          * If so, transfer that pin to the btree_update operation -
923          * note that if we're freeing multiple nodes, we only need to keep the
924          * oldest pin of any of the nodes we're freeing. We'll release the pin
925          * when the new nodes are persistent and reachable on disk:
926          */
927         w = btree_current_write(b);
928         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
929         bch2_journal_pin_drop(&c->journal, &w->journal);
930
931         w = btree_prev_write(b);
932         bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
933         bch2_journal_pin_drop(&c->journal, &w->journal);
934
935         mutex_unlock(&c->btree_interior_update_lock);
936 }
937
938 void bch2_btree_update_done(struct btree_update *as)
939 {
940         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
941
942         bch2_btree_reserve_put(as->c, as->reserve);
943         as->reserve = NULL;
944
945         continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
946 }
947
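/*
 * Start a new interior node update: allocates the btree_update, the node
 * reserve, and a journal pre-reservation for the index update:
 */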
948 struct btree_update *
949 bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
950                         unsigned nr_nodes, unsigned flags,
951                         struct closure *cl)
952 {
953         struct btree_reserve *reserve;
954         struct btree_update *as;
955         int ret;
956
957         reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
958         if (IS_ERR(reserve))
959                 return ERR_CAST(reserve);
960
961         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
962         memset(as, 0, sizeof(*as));
963         closure_init(&as->cl, NULL);
964         as->c           = c;
965         as->mode        = BTREE_INTERIOR_NO_UPDATE;
966         as->btree_id    = id;
967         as->reserve     = reserve;
968         INIT_LIST_HEAD(&as->write_blocked_list);
969
970         bch2_keylist_init(&as->parent_keys, as->inline_keys);
971
972         ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
973                                       ARRAY_SIZE(as->journal_entries), 0);
974         if (ret) {
975                 bch2_btree_reserve_put(c, reserve);
976                 closure_debug_destroy(&as->cl);
977                 mempool_free(as, &c->btree_interior_update_pool);
978                 return ERR_PTR(ret);
979         }
980
981         mutex_lock(&c->btree_interior_update_lock);
982         list_add_tail(&as->list, &c->btree_interior_update_list);
983         mutex_unlock(&c->btree_interior_update_lock);
984
985         return as;
986 }
987
988 /* Btree root updates: */
989
990 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
991 {
992         /* Root nodes cannot be reaped */
993         mutex_lock(&c->btree_cache.lock);
994         list_del_init(&b->list);
995         mutex_unlock(&c->btree_cache.lock);
996
997         mutex_lock(&c->btree_root_lock);
998         BUG_ON(btree_node_root(c, b) &&
999                (b->level < btree_node_root(c, b)->level ||
1000                 !btree_node_dying(btree_node_root(c, b))));
1001
1002         btree_node_root(c, b) = b;
1003         mutex_unlock(&c->btree_root_lock);
1004
1005         bch2_recalc_btree_reserve(c);
1006 }
1007
1008 static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
1009 {
1010         struct bch_fs *c = as->c;
1011         struct btree *old = btree_node_root(c, b);
1012         struct bch_fs_usage *fs_usage;
1013
1014         __bch2_btree_set_root_inmem(c, b);
1015
1016         mutex_lock(&c->btree_interior_update_lock);
1017         percpu_down_read(&c->mark_lock);
1018         fs_usage = bch2_fs_usage_scratch_get(c);
1019
1020         bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
1021                       0, 0, fs_usage, 0,
1022                       BTREE_TRIGGER_INSERT);
1023         if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
1024                 bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
1025                                      0, 0, NULL, 0,
1026                                      BTREE_TRIGGER_INSERT|
1027                                      BTREE_TRIGGER_GC);
1028
1029         if (old && !btree_node_fake(old))
1030                 bch2_btree_node_free_index(as, NULL,
1031                                            bkey_i_to_s_c(&old->key),
1032                                            fs_usage);
1033         bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
1034
1035         bch2_fs_usage_scratch_put(c, fs_usage);
1036         percpu_up_read(&c->mark_lock);
1037         mutex_unlock(&c->btree_interior_update_lock);
1038 }
1039
1040 /**
1041  * bch2_btree_set_root - update the root in memory and on disk
1042  *
1043  * To ensure forward progress, the current task must not be holding any
1044  * btree node write locks. However, you must hold an intent lock on the
1045  * old root.
1046  *
1047  * Note: This allocates a journal entry but doesn't add any keys to
1048  * it.  All the btree roots are part of every journal write, so there
1049  * is nothing new to be done.  This just guarantees that there is a
1050  * journal write.
1051  */
1052 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
1053                                 struct btree_iter *iter)
1054 {
1055         struct bch_fs *c = as->c;
1056         struct btree *old;
1057
1058         trace_btree_set_root(c, b);
1059         BUG_ON(!b->written &&
1060                !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
1061
1062         old = btree_node_root(c, b);
1063
1064         /*
1065          * Ensure no one is using the old root while we switch to the
1066          * new root:
1067          */
1068         bch2_btree_node_lock_write(old, iter);
1069
1070         bch2_btree_set_root_inmem(as, b);
1071
1072         btree_update_updated_root(as, b);
1073
1074         /*
1075          * Unlock old root after new root is visible:
1076          *
1077          * The new root isn't persistent, but that's ok: we still have
1078          * an intent lock on the new root, and any updates that would
1079          * depend on the new root would have to update the new root.
1080          */
1081         bch2_btree_node_unlock_write(old, iter);
1082 }
1083
1084 /* Interior node updates: */
1085
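/*
 * Insert a single btree pointer into @b: add it to the update's journal
 * entries, account for the insert (and any key it overwrites), then do the
 * in-memory bset insert:
 */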
1086 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1087                                         struct btree_iter *iter,
1088                                         struct bkey_i *insert,
1089                                         struct btree_node_iter *node_iter)
1090 {
1091         struct bch_fs *c = as->c;
1092         struct bch_fs_usage *fs_usage;
1093         struct jset_entry *entry;
1094         struct bkey_packed *k;
1095         struct bkey tmp;
1096
1097         BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
1098                ARRAY_SIZE(as->journal_entries));
1099
1100         entry = (void *) &as->journal_entries[as->journal_u64s];
1101         memset(entry, 0, sizeof(*entry));
1102         entry->u64s     = cpu_to_le16(insert->k.u64s);
1103         entry->type     = BCH_JSET_ENTRY_btree_keys;
1104         entry->btree_id = b->btree_id;
1105         entry->level    = b->level;
1106         memcpy_u64s_small(entry->_data, insert, insert->k.u64s);
1107         as->journal_u64s += jset_u64s(insert->k.u64s);
1108
1109         mutex_lock(&c->btree_interior_update_lock);
1110         percpu_down_read(&c->mark_lock);
1111         fs_usage = bch2_fs_usage_scratch_get(c);
1112
1113         bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
1114                              0, 0, fs_usage, 0,
1115                              BTREE_TRIGGER_INSERT);
1116
1117         if (gc_visited(c, gc_pos_btree_node(b)))
1118                 bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
1119                                      0, 0, NULL, 0,
1120                                      BTREE_TRIGGER_INSERT|
1121                                      BTREE_TRIGGER_GC);
1122
1123         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1124                bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1125                 bch2_btree_node_iter_advance(node_iter, b);
1126
1127         /*
1128          * If we're overwriting, look up pending delete and mark so that gc
1129          * marks it on the pending delete list:
1130          */
1131         if (k && !bkey_cmp_packed(b, k, &insert->k))
1132                 bch2_btree_node_free_index(as, b,
1133                                            bkey_disassemble(b, k, &tmp),
1134                                            fs_usage);
1135
1136         bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
1137
1138         bch2_fs_usage_scratch_put(c, fs_usage);
1139         percpu_up_read(&c->mark_lock);
1140         mutex_unlock(&c->btree_interior_update_lock);
1141
1142         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1143         set_btree_node_dirty(b);
1144         set_btree_node_need_write(b);
1145 }
1146
1147 /*
1148  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1149  * node)
1150  */
1151 static struct btree *__btree_split_node(struct btree_update *as,
1152                                         struct btree *n1,
1153                                         struct btree_iter *iter)
1154 {
1155         size_t nr_packed = 0, nr_unpacked = 0;
1156         struct btree *n2;
1157         struct bset *set1, *set2;
1158         struct bkey_packed *k, *prev = NULL;
1159
1160         n2 = bch2_btree_node_alloc(as, n1->level);
1161
1162         n2->data->max_key       = n1->data->max_key;
1163         n2->data->format        = n1->format;
1164         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1165         n2->key.k.p = n1->key.k.p;
1166
1167         btree_node_set_format(n2, n2->data->format);
1168
1169         set1 = btree_bset_first(n1);
1170         set2 = btree_bset_first(n2);
1171
1172         /*
1173          * Has to be a linear search because we don't have an auxiliary
1174          * search tree yet
1175          */
1176         k = set1->start;
1177         while (1) {
1178                 struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
1179
1180                 if (n == vstruct_last(set1))
1181                         break;
1182                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1183                         break;
1184
1185                 if (bkey_packed(k))
1186                         nr_packed++;
1187                 else
1188                         nr_unpacked++;
1189
1190                 prev = k;
1191                 k = n;
1192         }
1193
1194         BUG_ON(!prev);
1195
1196         btree_set_max(n1, bkey_unpack_pos(n1, prev));
1197         btree_set_min(n2, bkey_successor(n1->key.k.p));
1198
1199         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1200         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1201
1202         set_btree_bset_end(n1, n1->set);
1203         set_btree_bset_end(n2, n2->set);
1204
1205         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1206         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1207         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1208         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1209
1210         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1211         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1212         n1->nr.packed_keys      = nr_packed;
1213         n1->nr.unpacked_keys    = nr_unpacked;
1214
1215         BUG_ON(!set1->u64s);
1216         BUG_ON(!set2->u64s);
1217
1218         memcpy_u64s(set2->start,
1219                     vstruct_end(set1),
1220                     le16_to_cpu(set2->u64s));
1221
1222         btree_node_reset_sib_u64s(n1);
1223         btree_node_reset_sib_u64s(n2);
1224
1225         bch2_verify_btree_nr_keys(n1);
1226         bch2_verify_btree_nr_keys(n2);
1227
1228         if (n1->level) {
1229                 btree_node_interior_verify(n1);
1230                 btree_node_interior_verify(n2);
1231         }
1232
1233         return n2;
1234 }
1235
1236 /*
1237  * For updates to interior nodes, we've got to do the insert before we split
1238  * because the stuff we're inserting has to be inserted atomically. Post split,
1239  * the keys might have to go in different nodes and the split would no longer be
1240  * atomic.
1241  *
1242  * Worse, if the insert is from btree node coalescing and we do the insert after
1243  * the split (and pick the pivot), the pivot we pick might be between
1244  * nodes that were coalesced, and thus in the middle of a child node post
1245  * coalescing:
1246  */
1247 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1248                                     struct btree_iter *iter,
1249                                     struct keylist *keys)
1250 {
1251         struct btree_node_iter node_iter;
1252         struct bkey_i *k = bch2_keylist_front(keys);
1253         struct bkey_packed *src, *dst, *n;
1254         struct bset *i;
1255
1256         /*
1257          * XXX
1258          *
1259          * these updates must be journalled
1260          *
1261          * oops
1262          */
1263
1264         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1265
1266         bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1267
1268         while (!bch2_keylist_empty(keys)) {
1269                 k = bch2_keylist_front(keys);
1270
1271                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1272                 bch2_keylist_pop_front(keys);
1273         }
1274
1275         /*
1276          * We can't tolerate whiteouts here - with whiteouts there can be
1277          * duplicate keys, and it would be rather bad if we picked a duplicate
1278          * for the pivot:
1279          */
1280         i = btree_bset_first(b);
1281         src = dst = i->start;
1282         while (src != vstruct_last(i)) {
1283                 n = bkey_next_skip_noops(src, vstruct_last(i));
1284                 if (!bkey_deleted(src)) {
1285                         memmove_u64s_down(dst, src, src->u64s);
1286                         dst = bkey_next(dst);
1287                 }
1288                 src = n;
1289         }
1290
1291         i->u64s = cpu_to_le16((u64 *) dst - i->_data);
1292         set_btree_bset_end(b, b->set);
1293
1294         BUG_ON(b->nsets != 1 ||
1295                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1296
1297         btree_node_interior_verify(b);
1298 }
1299
1300 static void btree_split(struct btree_update *as, struct btree *b,
1301                         struct btree_iter *iter, struct keylist *keys,
1302                         unsigned flags)
1303 {
1304         struct bch_fs *c = as->c;
1305         struct btree *parent = btree_node_parent(iter, b);
1306         struct btree *n1, *n2 = NULL, *n3 = NULL;
1307         u64 start_time = local_clock();
1308
1309         BUG_ON(!parent && (b != btree_node_root(c, b)));
1310         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1311
1312         bch2_btree_interior_update_will_free_node(as, b);
1313
1314         n1 = bch2_btree_node_alloc_replacement(as, b);
1315
1316         if (keys)
1317                 btree_split_insert_keys(as, n1, iter, keys);
1318
1319         if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
1320                 trace_btree_split(c, b);
1321
1322                 n2 = __btree_split_node(as, n1, iter);
1323
1324                 bch2_btree_build_aux_trees(n2);
1325                 bch2_btree_build_aux_trees(n1);
1326                 six_unlock_write(&n2->lock);
1327                 six_unlock_write(&n1->lock);
1328
1329                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1330
1331                 /*
1332                  * Note that on recursive splits, parent_keys == keys, so we
1333                  * can't start adding new keys to parent_keys before emptying it
1334                  * out (which we did with btree_split_insert_keys() above)
1335                  */
1336                 bch2_keylist_add(&as->parent_keys, &n1->key);
1337                 bch2_keylist_add(&as->parent_keys, &n2->key);
1338
1339                 if (!parent) {
1340                         /* Depth increases, make a new root */
1341                         n3 = __btree_root_alloc(as, b->level + 1);
1342
1343                         n3->sib_u64s[0] = U16_MAX;
1344                         n3->sib_u64s[1] = U16_MAX;
1345
1346                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1347
1348                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1349                 }
1350         } else {
1351                 trace_btree_compact(c, b);
1352
1353                 bch2_btree_build_aux_trees(n1);
1354                 six_unlock_write(&n1->lock);
1355
1356                 if (parent)
1357                         bch2_keylist_add(&as->parent_keys, &n1->key);
1358         }
1359
1360         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1361
1362         /* New nodes all written, now make them visible: */
1363
1364         if (parent) {
1365                 /* Split a non root node */
1366                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1367         } else if (n3) {
1368                 bch2_btree_set_root(as, n3, iter);
1369         } else {
1370                 /* Root filled up but didn't need to be split */
1371                 bch2_btree_set_root(as, n1, iter);
1372         }
1373
1374         bch2_open_buckets_put(c, &n1->ob);
1375         if (n2)
1376                 bch2_open_buckets_put(c, &n2->ob);
1377         if (n3)
1378                 bch2_open_buckets_put(c, &n3->ob);
1379
1380         /* Successful split, update the iterator to point to the new nodes: */
1381
1382         six_lock_increment(&b->lock, SIX_LOCK_intent);
1383         bch2_btree_iter_node_drop(iter, b);
1384         if (n3)
1385                 bch2_btree_iter_node_replace(iter, n3);
1386         if (n2)
1387                 bch2_btree_iter_node_replace(iter, n2);
1388         bch2_btree_iter_node_replace(iter, n1);
1389
1390         /*
1391          * The old node must be freed (in memory) _before_ unlocking the new
1392          * nodes - else another thread could re-acquire a read lock on the old
1393          * node after another thread has locked and updated the new node, thus
1394          * seeing stale data:
1395          */
1396         bch2_btree_node_free_inmem(c, b, iter);
1397
1398         if (n3)
1399                 six_unlock_intent(&n3->lock);
1400         if (n2)
1401                 six_unlock_intent(&n2->lock);
1402         six_unlock_intent(&n1->lock);
1403
1404         bch2_btree_trans_verify_locks(iter->trans);
1405
1406         bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
1407                                start_time);
1408 }
1409
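/*
 * Insert @keys into interior node @b at the iterator's current position, then
 * re-peek any linked iterators pointing into @b:
 */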
1410 static void
1411 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1412                                 struct btree_iter *iter, struct keylist *keys)
1413 {
1414         struct btree_iter *linked;
1415         struct btree_node_iter node_iter;
1416         struct bkey_i *insert = bch2_keylist_front(keys);
1417         struct bkey_packed *k;
1418
1419         /* Don't screw up @iter's position: */
1420         node_iter = iter->l[b->level].iter;
1421
1422         /*
1423          * btree_split(), btree_gc_coalesce() will insert keys before
1424          * the iterator's current position - they know the keys go in
1425          * the node the iterator points to:
1426          */
1427         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1428                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1429                 ;
1430
1431         for_each_keylist_key(keys, insert)
1432                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1433
1434         btree_update_updated_node(as, b);
1435
1436         trans_for_each_iter_with_node(iter->trans, b, linked)
1437                 bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
1438
1439         bch2_btree_trans_verify_iters(iter->trans, b);
1440 }
1441
1442 /**
1443  * bch2_btree_insert_node - insert bkeys into a given btree node
1444  *
1445  * @as:                 btree_update this node insert is part of
1446  * @b:                  interior btree node to insert into
1447  * @iter:               btree iterator pointing at @b
1448  * @keys:               list of keys to insert
1449  *
1450  * Inserts as many keys as it can into a given btree node, splitting it if full.
1451  * If a split occurred, this function will return early. This can only happen
1452  * for leaf nodes -- inserts into interior nodes have to be atomic.
1453  */
1454 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1455                             struct btree_iter *iter, struct keylist *keys,
1456                             unsigned flags)
1457 {
1458         struct bch_fs *c = as->c;
1459         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1460         int old_live_u64s = b->nr.live_u64s;
1461         int live_u64s_added, u64s_added;
1462
1463         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1464         BUG_ON(!b->level);
1465         BUG_ON(!as || as->b);
1466         bch2_verify_keylist_sorted(keys);
1467
1468         if (as->must_rewrite)
1469                 goto split;
1470
1471         bch2_btree_node_lock_for_insert(c, b, iter);
1472
1473         if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
1474                 bch2_btree_node_unlock_write(b, iter);
1475                 goto split;
1476         }
1477
1478         bch2_btree_insert_keys_interior(as, b, iter, keys);
1479
1480         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1481         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1482
1483         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1484                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1485         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1486                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1487
1488         if (u64s_added > live_u64s_added &&
1489             bch2_maybe_compact_whiteouts(c, b))
1490                 bch2_btree_iter_reinit_node(iter, b);
1491
1492         bch2_btree_node_unlock_write(b, iter);
1493
1494         btree_node_interior_verify(b);
1495
1496         /*
1497          * when called from the btree_split path the new nodes aren't added to
1498          * the btree iterator yet, so the merge path's unlock/wait/relock dance
1499          * won't work:
1500          */
1501         bch2_foreground_maybe_merge(c, iter, b->level,
1502                                     flags|BTREE_INSERT_NOUNLOCK);
1503         return;
1504 split:
1505         btree_split(as, b, iter, keys, flags);
1506 }
1507
1508 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1509                           unsigned flags)
1510 {
1511         struct btree_trans *trans = iter->trans;
1512         struct btree *b = iter_l(iter)->b;
1513         struct btree_update *as;
1514         struct closure cl;
1515         int ret = 0;
1516         struct btree_iter *linked;
1517
1518         /*
1519          * We already have a disk reservation and open buckets pinned; this
1520          * allocation must not block:
1521          */
1522         trans_for_each_iter(trans, linked)
1523                 if (linked->btree_id == BTREE_ID_EXTENTS)
1524                         flags |= BTREE_INSERT_USE_RESERVE;
1525
1526         closure_init_stack(&cl);
1527
1528         /* Hack, because gc and splitting nodes don't mix yet: */
1529         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1530             !down_read_trylock(&c->gc_lock)) {
1531                 if (flags & BTREE_INSERT_NOUNLOCK)
1532                         return -EINTR;
1533
1534                 bch2_trans_unlock(trans);
1535                 down_read(&c->gc_lock);
1536
1537                 if (!bch2_trans_relock(trans))
1538                         ret = -EINTR;
1539         }
1540
1541         /*
1542          * XXX: figure out how far we might need to split,
1543          * instead of locking/reserving all the way to the root:
1544          */
1545         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1546                 trace_trans_restart_iter_upgrade(trans->ip);
1547                 ret = -EINTR;
1548                 goto out;
1549         }
1550
1551         as = bch2_btree_update_start(c, iter->btree_id,
1552                 btree_update_reserve_required(c, b), flags,
1553                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1554         if (IS_ERR(as)) {
1555                 ret = PTR_ERR(as);
1556                 if (ret == -EAGAIN) {
1557                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1558                         bch2_trans_unlock(trans);
1559                         ret = -EINTR;
1560                 }
1561                 goto out;
1562         }
1563
1564         btree_split(as, b, iter, NULL, flags);
1565         bch2_btree_update_done(as);
1566
1567         /*
1568          * We haven't successfully inserted yet, so don't downgrade all the way
1569          * back to read locks:
1570          */
1571         __bch2_btree_iter_downgrade(iter, 1);
1572 out:
1573         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1574                 up_read(&c->gc_lock);
1575         closure_sync(&cl);
1576         return ret;
1577 }
1578
1579 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1580                                    struct btree_iter *iter,
1581                                    unsigned level,
1582                                    unsigned flags,
1583                                    enum btree_node_sibling sib)
1584 {
1585         struct btree_trans *trans = iter->trans;
1586         struct btree_update *as;
1587         struct bkey_format_state new_s;
1588         struct bkey_format new_f;
1589         struct bkey_i delete;
1590         struct btree *b, *m, *n, *prev, *next, *parent;
1591         struct closure cl;
1592         size_t sib_u64s;
1593         int ret = 0;
1594
1595         BUG_ON(!btree_node_locked(iter, level));
1596
1597         closure_init_stack(&cl);
1598 retry:
1599         BUG_ON(!btree_node_locked(iter, level));
1600
1601         b = iter->l[level].b;
1602
1603         parent = btree_node_parent(iter, b);
1604         if (!parent)
1605                 goto out;
1606
1607         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1608                 goto out;
1609
1610         /* XXX: can't be holding read locks */
1611         m = bch2_btree_node_get_sibling(c, iter, b, sib);
1612         if (IS_ERR(m)) {
1613                 ret = PTR_ERR(m);
1614                 goto err;
1615         }
1616
1617         /* NULL means no sibling: */
1618         if (!m) {
1619                 b->sib_u64s[sib] = U16_MAX;
1620                 goto out;
1621         }
1622
1623         if (sib == btree_prev_sib) {
1624                 prev = m;
1625                 next = b;
1626         } else {
1627                 prev = b;
1628                 next = m;
1629         }
1630
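             /*
              * Estimate the size of the merged node: build a packed key format
              * that covers both @b and @m, then count how many u64s each
              * node's keys would take in that format:
              */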
1631         bch2_bkey_format_init(&new_s);
1632         __bch2_btree_calc_format(&new_s, b);
1633         __bch2_btree_calc_format(&new_s, m);
1634         new_f = bch2_bkey_format_done(&new_s);
1635
1636         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1637                 btree_node_u64s_with_format(m, &new_f);
1638
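             /*
              * If the combined size is over the hysteresis threshold H, only
              * count half of the excess, i.e. record H + (size - H) / 2 rather
              * than the full size:
              */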
1639         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1640                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1641                 sib_u64s /= 2;
1642                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1643         }
1644
1645         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1646         b->sib_u64s[sib] = sib_u64s;
1647
1648         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1649                 six_unlock_intent(&m->lock);
1650                 goto out;
1651         }
1652
1653         /* We're changing btree topology, which doesn't mix with gc: */
1654         if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
1655             !down_read_trylock(&c->gc_lock))
1656                 goto err_cycle_gc_lock;
1657
1658         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1659                 ret = -EINTR;
1660                 goto err_unlock;
1661         }
1662
1663         as = bch2_btree_update_start(c, iter->btree_id,
1664                          btree_update_reserve_required(c, parent) + 1,
1665                          BTREE_INSERT_NOFAIL|
1666                          BTREE_INSERT_USE_RESERVE,
1667                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1668         if (IS_ERR(as)) {
1669                 ret = PTR_ERR(as);
1670                 goto err_unlock;
1671         }
1672
1673         trace_btree_merge(c, b);
1674
1675         bch2_btree_interior_update_will_free_node(as, b);
1676         bch2_btree_interior_update_will_free_node(as, m);
1677
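             /*
              * Build the replacement node: it spans both siblings' ranges and
              * uses the combined format computed above; both old nodes are
              * sorted into it, and the parent is then updated with a deleted
              * key at @prev's position plus the new node's key:
              */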
1678         n = bch2_btree_node_alloc(as, b->level);
1679
1680         btree_set_min(n, prev->data->min_key);
1681         btree_set_max(n, next->data->max_key);
1682         n->data->format         = new_f;
1683
1684         btree_node_set_format(n, new_f);
1685
1686         bch2_btree_sort_into(c, n, prev);
1687         bch2_btree_sort_into(c, n, next);
1688
1689         bch2_btree_build_aux_trees(n);
1690         six_unlock_write(&n->lock);
1691
1692         bkey_init(&delete.k);
1693         delete.k.p = prev->key.k.p;
1694         bch2_keylist_add(&as->parent_keys, &delete);
1695         bch2_keylist_add(&as->parent_keys, &n->key);
1696
1697         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1698
1699         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1700
1701         bch2_open_buckets_put(c, &n->ob);
1702
1703         six_lock_increment(&b->lock, SIX_LOCK_intent);
1704         bch2_btree_iter_node_drop(iter, b);
1705         bch2_btree_iter_node_drop(iter, m);
1706
1707         bch2_btree_iter_node_replace(iter, n);
1708
1709         bch2_btree_trans_verify_iters(trans, n);
1710
1711         bch2_btree_node_free_inmem(c, b, iter);
1712         bch2_btree_node_free_inmem(c, m, iter);
1713
1714         six_unlock_intent(&n->lock);
1715
1716         bch2_btree_update_done(as);
1717
1718         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1719                 up_read(&c->gc_lock);
1720 out:
1721         bch2_btree_trans_verify_locks(trans);
1722
1723         /*
1724          * Don't downgrade locks here: we're called after successful insert,
1725          * and the caller will downgrade locks after a successful insert
1726          * anyway (in case e.g. a split was required first)
1727          *
1728          * And we're also called when inserting into interior nodes in the
1729          * split path, and downgrading to read locks in there is potentially
1730          * confusing:
1731          */
1732         closure_sync(&cl);
1733         return;
1734
1735 err_cycle_gc_lock:
1736         six_unlock_intent(&m->lock);
1737
1738         if (flags & BTREE_INSERT_NOUNLOCK)
1739                 goto out;
1740
1741         bch2_trans_unlock(trans);
1742
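             /*
              * Cycle the gc lock - take and immediately release it - to wait
              * for any gc currently holding it, then restart via -EINTR:
              */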
1743         down_read(&c->gc_lock);
1744         up_read(&c->gc_lock);
1745         ret = -EINTR;
1746         goto err;
1747
1748 err_unlock:
1749         six_unlock_intent(&m->lock);
1750         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1751                 up_read(&c->gc_lock);
1752 err:
1753         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1754
1755         if ((ret == -EAGAIN || ret == -EINTR) &&
1756             !(flags & BTREE_INSERT_NOUNLOCK)) {
1757                 bch2_trans_unlock(trans);
1758                 closure_sync(&cl);
1759                 ret = bch2_btree_iter_traverse(iter);
1760                 if (ret)
1761                         goto out;
1762
1763                 goto retry;
1764         }
1765
1766         goto out;
1767 }
1768
1769 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1770                                 struct btree *b, unsigned flags,
1771                                 struct closure *cl)
1772 {
1773         struct btree *n, *parent = btree_node_parent(iter, b);
1774         struct btree_update *as;
1775
1776         as = bch2_btree_update_start(c, iter->btree_id,
1777                 (parent
1778                  ? btree_update_reserve_required(c, parent)
1779                  : 0) + 1,
1780                 flags, cl);
1781         if (IS_ERR(as)) {
1782                 trace_btree_gc_rewrite_node_fail(c, b);
1783                 return PTR_ERR(as);
1784         }
1785
1786         bch2_btree_interior_update_will_free_node(as, b);
1787
1788         n = bch2_btree_node_alloc_replacement(as, b);
1789
1790         bch2_btree_build_aux_trees(n);
1791         six_unlock_write(&n->lock);
1792
1793         trace_btree_gc_rewrite_node(c, b);
1794
1795         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1796
1797         if (parent) {
1798                 bch2_keylist_add(&as->parent_keys, &n->key);
1799                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1800         } else {
1801                 bch2_btree_set_root(as, n, iter);
1802         }
1803
1804         bch2_open_buckets_put(c, &n->ob);
1805
1806         six_lock_increment(&b->lock, SIX_LOCK_intent);
1807         bch2_btree_iter_node_drop(iter, b);
1808         bch2_btree_iter_node_replace(iter, n);
1809         bch2_btree_node_free_inmem(c, b, iter);
1810         six_unlock_intent(&n->lock);
1811
1812         bch2_btree_update_done(as);
1813         return 0;
1814 }
1815
1816 /**
1817  * bch2_btree_node_rewrite - Rewrite/move a btree node
1818  *
1819  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
1820  * btree_check_reserve() has to wait)
1821  */
1822 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1823                             __le64 seq, unsigned flags)
1824 {
1825         struct btree_trans *trans = iter->trans;
1826         struct closure cl;
1827         struct btree *b;
1828         int ret;
1829
1830         flags |= BTREE_INSERT_NOFAIL;
1831
1832         closure_init_stack(&cl);
1833
1834         bch2_btree_iter_upgrade(iter, U8_MAX);
1835
1836         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1837                 if (!down_read_trylock(&c->gc_lock)) {
1838                         bch2_trans_unlock(trans);
1839                         down_read(&c->gc_lock);
1840                 }
1841         }
1842
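             /*
              * Retry loop: re-traverse the iterator, check via @seq that we
              * still have the node we were asked to rewrite, and retry after
              * unlocking if __btree_node_rewrite() returns -EAGAIN/-EINTR:
              */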
1843         while (1) {
1844                 ret = bch2_btree_iter_traverse(iter);
1845                 if (ret)
1846                         break;
1847
1848                 b = bch2_btree_iter_peek_node(iter);
1849                 if (!b || b->data->keys.seq != seq)
1850                         break;
1851
1852                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1853                 if (ret != -EAGAIN &&
1854                     ret != -EINTR)
1855                         break;
1856
1857                 bch2_trans_unlock(trans);
1858                 closure_sync(&cl);
1859         }
1860
1861         bch2_btree_iter_downgrade(iter);
1862
1863         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1864                 up_read(&c->gc_lock);
1865
1866         closure_sync(&cl);
1867         return ret;
1868 }
1869
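     /*
      * Example (hypothetical caller, for illustration only): rewriting
      * whatever node @iter currently points to, passing the node's sequence
      * number so the rewrite is skipped if the node has since been replaced:
      *
      *      struct btree *b = bch2_btree_iter_peek_node(iter);
      *
      *      if (b)
      *              ret = bch2_btree_node_rewrite(c, iter,
      *                                            b->data->keys.seq, 0);
      */
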
1870 static void __bch2_btree_node_update_key(struct bch_fs *c,
1871                                          struct btree_update *as,
1872                                          struct btree_iter *iter,
1873                                          struct btree *b, struct btree *new_hash,
1874                                          struct bkey_i *new_key)
1875 {
1876         struct btree *parent;
1877         int ret;
1878
1879         /*
1880          * Two corner cases that need to be thought about here:
1881          *
1882          * @b may not be reachable yet - there might be another interior update
1883          * operation waiting on @b to be written, and we're going to deliver the
1884          * write completion to that interior update operation _before_
1885          * persisting the new_key update
1886          *
1887          * That ends up working without us having to do anything special here:
1888          * the reason is, we do kick off (and do the in memory updates) for the
1889          * update for @new_key before we return, creating a new interior_update
1890          * operation here.
1891          *
1892          * The new interior update operation here will in effect override the
1893          * previous one. The previous one was going to terminate - make @b
1894          * reachable - in one of two ways:
1895          * - updating the btree root pointer
1896          *   In that case,
1897          *   no, this doesn't work. argh.
1898          */
1899
1900         if (b->will_make_reachable)
1901                 as->must_rewrite = true;
1902
1903         btree_interior_update_add_node_reference(as, b);
1904
1905         /*
1906          * XXX: the rest of the update path treats this like we're actually
1907          * inserting a new node and deleting the existing node, so the
1908          * reservation needs to include enough space for @b
1909          *
1910          * that is actually rather sketchy, though, and it's surprising that the
1911          * code works like that - this definitely needs to be reworked into
1912          * something saner.
1913          *
1914          * (I think @b is just getting double counted until the btree update
1915          * finishes and "deletes" @b on disk)
1916          */
1917         ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
1918                         c->opts.btree_node_size *
1919                         bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
1920                         BCH_DISK_RESERVATION_NOFAIL);
1921         BUG_ON(ret);
1922
1923         parent = btree_node_parent(iter, b);
1924         if (parent) {
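                     /*
                      * While the parent is being updated, @new_hash (if any)
                      * holds @new_key's slot in the btree node hash table;
                      * once the parent insert is done, @b itself is rehashed
                      * under the new key and the placeholder is removed:
                      */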
1925                 if (new_hash) {
1926                         bkey_copy(&new_hash->key, new_key);
1927                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
1928                                         new_hash, b->level, b->btree_id);
1929                         BUG_ON(ret);
1930                 }
1931
1932                 bch2_keylist_add(&as->parent_keys, new_key);
1933                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
1934
1935                 if (new_hash) {
1936                         mutex_lock(&c->btree_cache.lock);
1937                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
1938
1939                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1940
1941                         bkey_copy(&b->key, new_key);
1942                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1943                         BUG_ON(ret);
1944                         mutex_unlock(&c->btree_cache.lock);
1945                 } else {
1946                         bkey_copy(&b->key, new_key);
1947                 }
1948         } else {
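                     /*
                      * No parent - @b is the root: the block below marks the
                      * new key, accounts for freeing the old key via
                      * bch2_btree_node_free_index(), swaps the new key in
                      * (rehashing @b if its hash value changed) and records
                      * this as a root update:
                      */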
1949                 struct bch_fs_usage *fs_usage;
1950
1951                 BUG_ON(btree_node_root(c, b) != b);
1952
1953                 bch2_btree_node_lock_write(b, iter);
1954
1955                 mutex_lock(&c->btree_interior_update_lock);
1956                 percpu_down_read(&c->mark_lock);
1957                 fs_usage = bch2_fs_usage_scratch_get(c);
1958
1959                 bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
1960                               0, 0, fs_usage, 0,
1961                               BTREE_TRIGGER_INSERT);
1962                 if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
1963                         bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
1964                                              0, 0, NULL, 0,
1965                                              BTREE_TRIGGER_INSERT|
1966                                              BTREE_TRIGGER_GC);
1967
1968                 bch2_btree_node_free_index(as, NULL,
1969                                            bkey_i_to_s_c(&b->key),
1970                                            fs_usage);
1971                 bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res, 0);
1972
1973                 bch2_fs_usage_scratch_put(c, fs_usage);
1974                 percpu_up_read(&c->mark_lock);
1975                 mutex_unlock(&c->btree_interior_update_lock);
1976
1977                 if (btree_ptr_hash_val(new_key) != b->hash_val) {
1978                         mutex_lock(&c->btree_cache.lock);
1979                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1980
1981                         bkey_copy(&b->key, new_key);
1982                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1983                         BUG_ON(ret);
1984                         mutex_unlock(&c->btree_cache.lock);
1985                 } else {
1986                         bkey_copy(&b->key, new_key);
1987                 }
1988
1989                 btree_update_updated_root(as, b);
1990                 bch2_btree_node_unlock_write(b, iter);
1991         }
1992
1993         bch2_btree_update_done(as);
1994 }
1995
1996 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
1997                                struct btree *b,
1998                                struct bkey_i *new_key)
1999 {
2000         struct btree *parent = btree_node_parent(iter, b);
2001         struct btree_update *as = NULL;
2002         struct btree *new_hash = NULL;
2003         struct closure cl;
2004         int ret;
2005
2006         closure_init_stack(&cl);
2007
2008         if (!bch2_btree_iter_upgrade(iter, U8_MAX))
2009                 return -EINTR;
2010
2011         if (!down_read_trylock(&c->gc_lock)) {
2012                 bch2_trans_unlock(iter->trans);
2013                 down_read(&c->gc_lock);
2014
2015                 if (!bch2_trans_relock(iter->trans)) {
2016                         ret = -EINTR;
2017                         goto err;
2018                 }
2019         }
2020
2021         /*
2022          * check btree_ptr_hash_val() after @b is locked by
2023          * btree_iter_traverse():
2024          */
2025         if (btree_ptr_hash_val(new_key) != b->hash_val) {
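                     /*
                      * The btree node hash table is keyed off
                      * btree_ptr_hash_val(); if the new key hashes differently
                      * a spare node (@new_hash) is needed to hold the new hash
                      * table entry during the update, so allocate one up
                      * front:
                      */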
2026                 /* bch2_btree_reserve_get will unlock */
2027                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2028                 if (ret) {
2029                         bch2_trans_unlock(iter->trans);
2030                         up_read(&c->gc_lock);
2031                         closure_sync(&cl);
2032                         down_read(&c->gc_lock);
2033
2034                         if (!bch2_trans_relock(iter->trans)) {
2035                                 ret = -EINTR;
2036                                 goto err;
2037                         }
2038                 }
2039
2040                 new_hash = bch2_btree_node_mem_alloc(c);
2041         }
2042
2043         as = bch2_btree_update_start(c, iter->btree_id,
2044                 parent ? btree_update_reserve_required(c, parent) : 0,
2045                 BTREE_INSERT_NOFAIL|
2046                 BTREE_INSERT_USE_RESERVE|
2047                 BTREE_INSERT_USE_ALLOC_RESERVE,
2048                 &cl);
2049
2050         if (IS_ERR(as)) {
2051                 ret = PTR_ERR(as);
2052                 if (ret == -EAGAIN)
2053                         ret = -EINTR;
2054
2055                 if (ret != -EINTR)
2056                         goto err;
2057
2058                 bch2_trans_unlock(iter->trans);
2059                 up_read(&c->gc_lock);
2060                 closure_sync(&cl);
2061                 down_read(&c->gc_lock);
2062
2063                 if (!bch2_trans_relock(iter->trans))
2064                         goto err;
2065         }
2066
2067         ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
2068         if (ret)
2069                 goto err_free_update;
2070
2071         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
2072
2073         bch2_btree_iter_downgrade(iter);
2074 err:
2075         if (new_hash) {
2076                 mutex_lock(&c->btree_cache.lock);
2077                 list_move(&new_hash->list, &c->btree_cache.freeable);
2078                 mutex_unlock(&c->btree_cache.lock);
2079
2080                 six_unlock_write(&new_hash->lock);
2081                 six_unlock_intent(&new_hash->lock);
2082         }
2083         up_read(&c->gc_lock);
2084         closure_sync(&cl);
2085         return ret;
2086 err_free_update:
2087         bch2_btree_update_free(as);
2088         goto err;
2089 }
2090
2091 /* Init code: */
2092
2093 /*
2094  * Only for filesystem bringup: when first reading the btree roots, or when
2095  * allocating the btree roots while initializing a new filesystem:
2096  */
2097 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2098 {
2099         BUG_ON(btree_node_root(c, b));
2100
2101         __bch2_btree_set_root_inmem(c, b);
2102 }
2103
2104 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
2105 {
2106         struct closure cl;
2107         struct btree *b;
2108         int ret;
2109
2110         closure_init_stack(&cl);
2111
2112         do {
2113                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2114                 closure_sync(&cl);
2115         } while (ret);
2116
2117         b = bch2_btree_node_mem_alloc(c);
2118         bch2_btree_cache_cannibalize_unlock(c);
2119
2120         set_btree_node_fake(b);
2121         b->level        = 0;
2122         b->btree_id     = id;
2123
2124         bkey_btree_ptr_init(&b->key);
2125         b->key.k.p = POS_MAX;
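             /*
              * U64_MAX - id gives each fake root a distinct (and never
              * dereferenced, since the node is marked fake) pointer value -
              * presumably so the roots' hash table entries don't collide:
              */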
2126         *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
2127
2128         bch2_bset_init_first(b, &b->data->keys);
2129         bch2_btree_build_aux_trees(b);
2130
2131         b->data->flags = 0;
2132         btree_set_min(b, POS_MIN);
2133         btree_set_max(b, POS_MAX);
2134         b->data->format = bch2_btree_calc_format(b);
2135         btree_node_set_format(b, b->data->format);
2136
2137         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
2138         BUG_ON(ret);
2139
2140         __bch2_btree_set_root_inmem(c, b);
2141
2142         six_unlock_write(&b->lock);
2143         six_unlock_intent(&b->lock);
2144 }
2145
2146 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
2147 {
2148         struct printbuf out = _PBUF(buf, PAGE_SIZE);
2149         struct btree_update *as;
2150
2151         mutex_lock(&c->btree_interior_update_lock);
2152         list_for_each_entry(as, &c->btree_interior_update_list, list)
2153                 pr_buf(&out, "%p m %u w %u r %u j %llu\n",
2154                        as,
2155                        as->mode,
2156                        as->nodes_written,
2157                        atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
2158                        as->journal.seq);
2159         mutex_unlock(&c->btree_interior_update_lock);
2160
2161         return out.pos - buf;
2162 }
2163
2164 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
2165 {
2166         size_t ret = 0;
2167         struct list_head *i;
2168
2169         mutex_lock(&c->btree_interior_update_lock);
2170         list_for_each(i, &c->btree_interior_update_list)
2171                 ret++;
2172         mutex_unlock(&c->btree_interior_update_lock);
2173
2174         return ret;
2175 }