/* bcachefs-tools: libbcachefs/btree_update_interior.c */
1
2 #include "bcachefs.h"
3 #include "alloc.h"
4 #include "bkey_methods.h"
5 #include "btree_cache.h"
6 #include "btree_gc.h"
7 #include "btree_update.h"
8 #include "btree_update_interior.h"
9 #include "btree_io.h"
10 #include "btree_iter.h"
11 #include "btree_locking.h"
12 #include "buckets.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "journal_reclaim.h"
16 #include "keylist.h"
17 #include "replicas.h"
18 #include "super-io.h"
19
20 #include <linux/random.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_node_will_make_reachable(struct btree_update *,
24                                            struct btree *);
25 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
26 static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
27
28 /* Debug code: */
29
30 static void btree_node_interior_verify(struct btree *b)
31 {
32         struct btree_node_iter iter;
33         struct bkey_packed *k;
34
35         BUG_ON(!b->level);
36
37         bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
38 #if 1
39         BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
40                bkey_cmp_left_packed(b, k, &b->key.k.p));
41
42         BUG_ON((bch2_btree_node_iter_advance(&iter, b),
43                 !bch2_btree_node_iter_end(&iter)));
44 #else
45         const char *msg;
46
47         msg = "not found";
48         k = bch2_btree_node_iter_peek(&iter, b);
49         if (!k)
50                 goto err;
51
52         msg = "isn't what it should be";
53         if (bkey_cmp_left_packed(b, k, &b->key.k.p))
54                 goto err;
55
56         bch2_btree_node_iter_advance(&iter, b);
57
58         msg = "isn't last key";
59         if (!bch2_btree_node_iter_end(&iter))
60                 goto err;
61         return;
62 err:
63         bch2_dump_btree_node(b);
64         printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
65                b->key.k.p.offset, msg);
66         BUG();
67 #endif
68 }
69
70 /* Calculate ideal packed bkey format for new btree nodes: */
71
72 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
73 {
74         struct bkey_packed *k;
75         struct bset_tree *t;
76         struct bkey uk;
77
78         bch2_bkey_format_add_pos(s, b->data->min_key);
79
80         for_each_bset(b, t)
81                 for (k = btree_bkey_first(b, t);
82                      k != btree_bkey_last(b, t);
83                      k = bkey_next(k))
84                         if (!bkey_whiteout(k)) {
85                                 uk = bkey_unpack_key(b, k);
86                                 bch2_bkey_format_add_key(s, &uk);
87                         }
88 }
89
90 static struct bkey_format bch2_btree_calc_format(struct btree *b)
91 {
92         struct bkey_format_state s;
93
94         bch2_bkey_format_init(&s);
95         __bch2_btree_calc_format(&s, b);
96
97         return bch2_bkey_format_done(&s);
98 }
99
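/*
 * How many u64s @b's live keys would take up if repacked with @new_f,
 * assuming every key can be packed with the new format:
 *
 *	live_u64s
 *	  + (new key_u64s - old key_u64s) * nr packed keys
 *	  + (new key_u64s - BKEY_U64s)    * nr unpacked keys
 */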
100 static size_t btree_node_u64s_with_format(struct btree *b,
101                                           struct bkey_format *new_f)
102 {
103         struct bkey_format *old_f = &b->format;
104
105         /* stupid integer promotion rules */
106         ssize_t delta =
107             (((int) new_f->key_u64s - old_f->key_u64s) *
108              (int) b->nr.packed_keys) +
109             (((int) new_f->key_u64s - BKEY_U64s) *
110              (int) b->nr.unpacked_keys);
111
112         BUG_ON(delta + b->nr.live_u64s < 0);
113
114         return b->nr.live_u64s + delta;
115 }
116
117 /**
118  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
119  *
120  * This assumes all keys can pack with the new format -- it just checks if
121  * the re-packed keys would fit inside the node itself.
122  */
123 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
124                                  struct bkey_format *new_f)
125 {
126         size_t u64s = btree_node_u64s_with_format(b, new_f);
127
128         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
129 }
130
131 /* Btree node freeing/allocation: */
132
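/*
 * Two keys for the same btree node match if any of their pointers are
 * identical (same device, generation and offset):
 */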
133 static bool btree_key_matches(struct bch_fs *c,
134                               struct bkey_s_c_extent l,
135                               struct bkey_s_c_extent r)
136 {
137         const struct bch_extent_ptr *ptr1, *ptr2;
138
139         extent_for_each_ptr(l, ptr1)
140                 extent_for_each_ptr(r, ptr2)
141                         if (ptr1->dev == ptr2->dev &&
142                             ptr1->gen == ptr2->gen &&
143                             ptr1->offset == ptr2->offset)
144                                 return true;
145
146         return false;
147 }
148
149 /*
150  * We're doing the index update that makes @b unreachable, update stuff to
151  * reflect that:
152  *
153  * Must be called _before_ btree_update_updated_root() or
154  * btree_update_updated_node:
155  */
156 static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
157                                        struct bkey_s_c k,
158                                        struct bch_fs_usage *stats)
159 {
160         struct bch_fs *c = as->c;
161         struct pending_btree_node_free *d;
162         unsigned replicas;
163
164         /*
165          * btree_update lock is only needed here to avoid racing with
166          * gc:
167          */
168         mutex_lock(&c->btree_interior_update_lock);
169
170         for (d = as->pending; d < as->pending + as->nr_pending; d++)
171                 if (!bkey_cmp(k.k->p, d->key.k.p) &&
172                     btree_key_matches(c, bkey_s_c_to_extent(k),
173                                       bkey_i_to_s_c_extent(&d->key)))
174                         goto found;
175         BUG();
176 found:
177         BUG_ON(d->index_update_done);
178         d->index_update_done = true;
179
180         /*
181          * Btree nodes are accounted as freed in bch_alloc_stats when they're
182          * freed from the index:
183          */
184         replicas = bch2_extent_nr_dirty_ptrs(k);
185         if (replicas)
186                 stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size;
187
188         /*
189          * We're dropping @k from the btree, but it's still live until the
190          * index update is persistent so we need to keep a reference around for
191          * mark and sweep to find - that's primarily what the
192          * btree_node_pending_free list is for.
193          *
194          * So here (when we set index_update_done = true), we're moving an
195          * existing reference to a different part of the larger "gc keyspace" -
196          * and the new position comes after the old position, since GC marks
197          * the pending free list after it walks the btree.
198          *
199          * If we move the reference while mark and sweep is _between_ the old
200          * and the new position, mark and sweep will see the reference twice
201          * and it'll get double accounted - so check for that here and subtract
202          * to cancel out one of mark and sweep's markings if necessary:
203          */
204
205         /*
206          * bch2_mark_key() compares the current gc pos to the pos we're
207          * moving this reference from, hence one comparison here:
208          */
209         if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
210                 struct bch_fs_usage tmp = { 0 };
211
212                 bch2_mark_key(c, bkey_i_to_s_c(&d->key),
213                              -c->opts.btree_node_size, true, b
214                              ? gc_pos_btree_node(b)
215                              : gc_pos_btree_root(as->btree_id),
216                              &tmp, 0, 0);
217                 /*
218                  * Don't apply tmp - pending deletes aren't tracked in
219                  * bch_alloc_stats:
220                  */
221         }
222
223         mutex_unlock(&c->btree_interior_update_lock);
224 }
225
226 static void __btree_node_free(struct bch_fs *c, struct btree *b)
227 {
228         trace_btree_node_free(c, b);
229
230         BUG_ON(btree_node_dirty(b));
231         BUG_ON(btree_node_need_write(b));
232         BUG_ON(b == btree_node_root(c, b));
233         BUG_ON(b->ob.nr);
234         BUG_ON(!list_empty(&b->write_blocked));
235         BUG_ON(b->will_make_reachable);
236
237         clear_btree_node_noevict(b);
238
239         bch2_btree_node_hash_remove(&c->btree_cache, b);
240
241         mutex_lock(&c->btree_cache.lock);
242         list_move(&b->list, &c->btree_cache.freeable);
243         mutex_unlock(&c->btree_cache.lock);
244 }
245
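/*
 * Free a node that was allocated but whose pointer was never inserted into
 * the btree: drop it from its btree_update's new node tracking, free the
 * in-memory node and release the open buckets backing it.
 */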
246 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
247 {
248         struct btree_ob_ref ob = b->ob;
249
250         btree_update_drop_new_node(c, b);
251
252         b->ob.nr = 0;
253
254         clear_btree_node_dirty(b);
255
256         btree_node_lock_type(c, b, SIX_LOCK_write);
257         __btree_node_free(c, b);
258         six_unlock_write(&b->lock);
259
260         bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
261 }
262
263 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
264                                 struct btree_iter *iter)
265 {
266         /*
267          * Is this a node that isn't reachable on disk yet?
268          *
269          * Nodes that aren't reachable yet have writes blocked until they're
270          * reachable - now that we've cancelled any pending writes and moved
271          * things waiting on that write to wait on this update, we can drop this
272          * node from the list of nodes that the other update is making
273          * reachable, prior to freeing it:
274          */
275         btree_update_drop_new_node(c, b);
276
277         __bch2_btree_node_lock_write(b, iter);
278         __btree_node_free(c, b);
279         six_unlock_write(&b->lock);
280
281         bch2_btree_iter_node_drop(iter, b);
282 }
283
284 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
285                                         struct pending_btree_node_free *pending)
286 {
287         struct bch_fs_usage stats = { 0 };
288
289         BUG_ON(!pending->index_update_done);
290
291         bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
292                      -c->opts.btree_node_size, true,
293                      gc_phase(GC_PHASE_PENDING_DELETE),
294                      &stats, 0, 0);
295         /*
296          * Don't apply stats - pending deletes aren't tracked in
297          * bch_alloc_stats:
298          */
299 }
300
301 void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
302 {
303         bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs);
304 }
305
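/*
 * Allocate the on-disk space and in-memory structure for a new btree node:
 * reuse an entry from the btree reserve cache if one is available, otherwise
 * allocate sectors from the btree write point.
 */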
306 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
307                                              struct disk_reservation *res,
308                                              struct closure *cl,
309                                              unsigned flags)
310 {
311         struct write_point *wp;
312         struct btree *b;
313         BKEY_PADDED(k) tmp;
314         struct bkey_i_extent *e;
315         struct btree_ob_ref ob;
316         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
317         unsigned nr_reserve;
318         enum alloc_reserve alloc_reserve;
319
320         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
321                 nr_reserve      = 0;
322                 alloc_reserve   = RESERVE_ALLOC;
323         } else if (flags & BTREE_INSERT_USE_RESERVE) {
324                 nr_reserve      = BTREE_NODE_RESERVE / 2;
325                 alloc_reserve   = RESERVE_BTREE;
326         } else {
327                 nr_reserve      = BTREE_NODE_RESERVE;
328                 alloc_reserve   = RESERVE_NONE;
329         }
330
331         mutex_lock(&c->btree_reserve_cache_lock);
332         if (c->btree_reserve_cache_nr > nr_reserve) {
333                 struct btree_alloc *a =
334                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
335
336                 ob = a->ob;
337                 bkey_copy(&tmp.k, &a->k);
338                 mutex_unlock(&c->btree_reserve_cache_lock);
339                 goto mem_alloc;
340         }
341         mutex_unlock(&c->btree_reserve_cache_lock);
342
343 retry:
344         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
345                                       writepoint_ptr(&c->btree_write_point),
346                                       &devs_have,
347                                       res->nr_replicas,
348                                       c->opts.metadata_replicas_required,
349                                       alloc_reserve, 0, cl);
350         if (IS_ERR(wp))
351                 return ERR_CAST(wp);
352
353         if (wp->sectors_free < c->opts.btree_node_size) {
354                 struct open_bucket *ob;
355                 unsigned i;
356
357                 writepoint_for_each_ptr(wp, ob, i)
358                         if (ob->sectors_free < c->opts.btree_node_size)
359                                 ob->sectors_free = 0;
360
361                 bch2_alloc_sectors_done(c, wp);
362                 goto retry;
363         }
364
365         e = bkey_extent_init(&tmp.k);
366         bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
367
368         ob.nr = 0;
369         bch2_open_bucket_get(c, wp, &ob.nr, ob.refs);
370         bch2_alloc_sectors_done(c, wp);
371 mem_alloc:
372         b = bch2_btree_node_mem_alloc(c);
373
374         /* we hold cannibalize_lock: */
375         BUG_ON(IS_ERR(b));
376         BUG_ON(b->ob.nr);
377
378         bkey_copy(&b->key, &tmp.k);
379         b->ob = ob;
380
381         return b;
382 }
383
384 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
385 {
386         struct bch_fs *c = as->c;
387         struct btree *b;
388
389         BUG_ON(level >= BTREE_MAX_DEPTH);
390         BUG_ON(!as->reserve->nr);
391
392         b = as->reserve->b[--as->reserve->nr];
393
394         BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
395
396         set_btree_node_accessed(b);
397         set_btree_node_dirty(b);
398
399         bch2_bset_init_first(b, &b->data->keys);
400         memset(&b->nr, 0, sizeof(b->nr));
401         b->data->magic = cpu_to_le64(bset_magic(c));
402         b->data->flags = 0;
403         SET_BTREE_NODE_ID(b->data, as->btree_id);
404         SET_BTREE_NODE_LEVEL(b->data, level);
405         b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
406
407         bch2_btree_build_aux_trees(b);
408
409         btree_node_will_make_reachable(as, b);
410
411         trace_btree_node_alloc(c, b);
412         return b;
413 }
414
415 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
416                                                   struct btree *b,
417                                                   struct bkey_format format)
418 {
419         struct btree *n;
420
421         n = bch2_btree_node_alloc(as, b->level);
422
423         n->data->min_key        = b->data->min_key;
424         n->data->max_key        = b->data->max_key;
425         n->data->format         = format;
426         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
427
428         btree_node_set_format(n, format);
429
430         bch2_btree_sort_into(as->c, n, b);
431
432         btree_node_reset_sib_u64s(n);
433
434         n->key.k.p = b->key.k.p;
435         return n;
436 }
437
438 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
439                                                        struct btree *b)
440 {
441         struct bkey_format new_f = bch2_btree_calc_format(b);
442
443         /*
444          * The keys might expand with the new format - if they wouldn't fit in
445          * the btree node anymore, use the old format for now:
446          */
447         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
448                 new_f = b->format;
449
450         return __bch2_btree_node_alloc_replacement(as, b, new_f);
451 }
452
453 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
454 {
455         struct btree *b = bch2_btree_node_alloc(as, level);
456
457         b->data->min_key = POS_MIN;
458         b->data->max_key = POS_MAX;
459         b->data->format = bch2_btree_calc_format(b);
460         b->key.k.p = POS_MAX;
461
462         btree_node_set_format(b, b->data->format);
463         bch2_btree_build_aux_trees(b);
464
465         six_unlock_write(&b->lock);
466
467         return b;
468 }
469
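/*
 * Return an unused (or partially used) reserve: stash each node's allocation
 * in the btree reserve cache if there's room, otherwise release its open
 * buckets, then free the in-memory nodes and drop the disk reservation.
 */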
470 static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
471 {
472         bch2_disk_reservation_put(c, &reserve->disk_res);
473
474         mutex_lock(&c->btree_reserve_cache_lock);
475
476         while (reserve->nr) {
477                 struct btree *b = reserve->b[--reserve->nr];
478
479                 six_unlock_write(&b->lock);
480
481                 if (c->btree_reserve_cache_nr <
482                     ARRAY_SIZE(c->btree_reserve_cache)) {
483                         struct btree_alloc *a =
484                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
485
486                         a->ob = b->ob;
487                         b->ob.nr = 0;
488                         bkey_copy(&a->k, &b->key);
489                 } else {
490                         bch2_btree_open_bucket_put(c, b);
491                 }
492
493                 btree_node_lock_type(c, b, SIX_LOCK_write);
494                 __btree_node_free(c, b);
495                 six_unlock_write(&b->lock);
496
497                 six_unlock_intent(&b->lock);
498         }
499
500         mutex_unlock(&c->btree_reserve_cache_lock);
501
502         mempool_free(reserve, &c->btree_reserve_pool);
503 }
504
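/*
 * Get a disk reservation and preallocate @nr_nodes btree nodes for an
 * upcoming interior node update; returns an ERR_PTR on failure.
 */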
505 static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
506                                                     unsigned nr_nodes,
507                                                     unsigned flags,
508                                                     struct closure *cl)
509 {
510         struct btree_reserve *reserve;
511         struct btree *b;
512         struct disk_reservation disk_res = { 0, 0 };
513         unsigned sectors = nr_nodes * c->opts.btree_node_size;
514         int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
515
516         if (flags & BTREE_INSERT_NOFAIL)
517                 disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
518
519         /*
520          * This check isn't necessary for correctness - it's just to potentially
521          * prevent us from doing a lot of work that'll end up being wasted:
522          */
523         ret = bch2_journal_error(&c->journal);
524         if (ret)
525                 return ERR_PTR(ret);
526
527         if (bch2_disk_reservation_get(c, &disk_res, sectors,
528                                       c->opts.metadata_replicas,
529                                       disk_res_flags))
530                 return ERR_PTR(-ENOSPC);
531
532         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
533
534         /*
535          * Protects reaping from the btree node cache and using the btree node
536          * open bucket reserve:
537          */
538         ret = bch2_btree_cache_cannibalize_lock(c, cl);
539         if (ret) {
540                 bch2_disk_reservation_put(c, &disk_res);
541                 return ERR_PTR(ret);
542         }
543
544         reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
545
546         reserve->disk_res = disk_res;
547         reserve->nr = 0;
548
549         while (reserve->nr < nr_nodes) {
550                 b = __bch2_btree_node_alloc(c, &disk_res,
551                                             flags & BTREE_INSERT_NOWAIT
552                                             ? NULL : cl, flags);
553                 if (IS_ERR(b)) {
554                         ret = PTR_ERR(b);
555                         goto err_free;
556                 }
557
558                 ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
559                                               bkey_i_to_s_c(&b->key));
560                 if (ret)
561                         goto err_free;
562
563                 reserve->b[reserve->nr++] = b;
564         }
565
566         bch2_btree_cache_cannibalize_unlock(c);
567         return reserve;
568 err_free:
569         bch2_btree_reserve_put(c, reserve);
570         bch2_btree_cache_cannibalize_unlock(c);
571         trace_btree_reserve_get_fail(c, nr_nodes, cl);
572         return ERR_PTR(ret);
573 }
574
575 /* Asynchronous interior node update machinery */
576
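/*
 * Final teardown of a btree_update: return any unused reserve, take it off
 * the list of in flight interior updates and drop the ref on c->writes taken
 * in bch2_btree_update_start().
 */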
577 static void bch2_btree_update_free(struct btree_update *as)
578 {
579         struct bch_fs *c = as->c;
580
581         BUG_ON(as->nr_new_nodes);
582         BUG_ON(as->nr_pending);
583
584         if (as->reserve)
585                 bch2_btree_reserve_put(c, as->reserve);
586
587         mutex_lock(&c->btree_interior_update_lock);
588         list_del(&as->list);
589
590         closure_debug_destroy(&as->cl);
591         mempool_free(as, &c->btree_interior_update_pool);
592         percpu_ref_put(&c->writes);
593
594         closure_wake_up(&c->btree_interior_update_wait);
595         mutex_unlock(&c->btree_interior_update_lock);
596 }
597
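/*
 * The new nodes this update made reachable are now persistent: drop our
 * journal pin, clear will_make_reachable on each new node (writing it out now
 * if a write was being held back), free the replaced nodes on disk, then tear
 * down the btree_update.
 */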
598 static void btree_update_nodes_reachable(struct closure *cl)
599 {
600         struct btree_update *as = container_of(cl, struct btree_update, cl);
601         struct bch_fs *c = as->c;
602
603         bch2_journal_pin_drop(&c->journal, &as->journal);
604
605         mutex_lock(&c->btree_interior_update_lock);
606
607         while (as->nr_new_nodes) {
608                 struct btree *b = as->new_nodes[--as->nr_new_nodes];
609
610                 BUG_ON(b->will_make_reachable != (unsigned long) as);
611                 b->will_make_reachable = 0;
612                 mutex_unlock(&c->btree_interior_update_lock);
613
614                 /*
615                  * b->will_make_reachable prevented it from being written, so
616                  * write it now if it needs to be written:
617                  */
618                 btree_node_lock_type(c, b, SIX_LOCK_read);
619                 bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
620                 six_unlock_read(&b->lock);
621                 mutex_lock(&c->btree_interior_update_lock);
622         }
623
624         while (as->nr_pending)
625                 bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
626
627         mutex_unlock(&c->btree_interior_update_lock);
628
629         closure_wake_up(&as->wait);
630
631         bch2_btree_update_free(as);
632 }
633
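/*
 * For root updates (see btree_update_nodes_written()): flush the journal
 * sequence that will contain the pointer to the new root before continuing
 * to btree_update_nodes_reachable().
 */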
634 static void btree_update_wait_on_journal(struct closure *cl)
635 {
636         struct btree_update *as = container_of(cl, struct btree_update, cl);
637         struct bch_fs *c = as->c;
638         int ret;
639
640         ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
641         if (ret < 0)
642                 goto err;
643         if (!ret) {
644                 continue_at(cl, btree_update_wait_on_journal, system_wq);
645                 return;
646         }
647
648         bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
649 err:
650         continue_at(cl, btree_update_nodes_reachable, system_wq);
651 }
652
653 static void btree_update_nodes_written(struct closure *cl)
654 {
655         struct btree_update *as = container_of(cl, struct btree_update, cl);
656         struct bch_fs *c = as->c;
657         struct btree *b;
658
659         /*
660          * We did an update to a parent node where the pointers we added pointed
661          * to child nodes that weren't written yet: now, the child nodes have
662          * been written so we can write out the update to the interior node.
663          */
664 retry:
665         mutex_lock(&c->btree_interior_update_lock);
666         as->nodes_written = true;
667
668         switch (as->mode) {
669         case BTREE_INTERIOR_NO_UPDATE:
670                 BUG();
671         case BTREE_INTERIOR_UPDATING_NODE:
672                 /* The usual case: */
673                 b = READ_ONCE(as->b);
674
675                 if (!six_trylock_read(&b->lock)) {
676                         mutex_unlock(&c->btree_interior_update_lock);
677                         btree_node_lock_type(c, b, SIX_LOCK_read);
678                         six_unlock_read(&b->lock);
679                         goto retry;
680                 }
681
682                 BUG_ON(!btree_node_dirty(b));
683                 closure_wait(&btree_current_write(b)->wait, cl);
684
685                 list_del(&as->write_blocked_list);
686                 mutex_unlock(&c->btree_interior_update_lock);
687
688                 /*
689                  * b->write_blocked prevented it from being written, so
690                  * write it now if it needs to be written:
691                  */
692                 bch2_btree_node_write_cond(c, b, true);
693                 six_unlock_read(&b->lock);
694                 break;
695
696         case BTREE_INTERIOR_UPDATING_AS:
697                 /*
698                  * The btree node we originally updated has been freed and is
699                  * being rewritten - so we don't need to write anything here, we just
700                  * need to signal to that btree_update that it's ok to make the
701                  * new replacement node visible:
702                  */
703                 closure_put(&as->parent_as->cl);
704
705                 /*
706                  * and then we have to wait on that btree_update to finish:
707                  */
708                 closure_wait(&as->parent_as->wait, cl);
709                 mutex_unlock(&c->btree_interior_update_lock);
710                 break;
711
712         case BTREE_INTERIOR_UPDATING_ROOT:
713                 /* b is the new btree root: */
714                 b = READ_ONCE(as->b);
715
716                 if (!six_trylock_read(&b->lock)) {
717                         mutex_unlock(&c->btree_interior_update_lock);
718                         btree_node_lock_type(c, b, SIX_LOCK_read);
719                         six_unlock_read(&b->lock);
720                         goto retry;
721                 }
722
723                 BUG_ON(c->btree_roots[b->btree_id].as != as);
724                 c->btree_roots[b->btree_id].as = NULL;
725
726                 bch2_btree_set_root_ondisk(c, b, WRITE);
727
728                 /*
729                  * We don't have to wait on anything here (before
730                  * btree_update_nodes_reachable frees the old nodes
731                  * ondisk) - we've ensured that the very next journal write will
732                  * have the pointer to the new root, and before the allocator
733                  * can reuse the old nodes it'll have to do a journal commit:
734                  */
735                 six_unlock_read(&b->lock);
736                 mutex_unlock(&c->btree_interior_update_lock);
737
738                 /*
739                  * Bit of funny circularity going on here we have to break:
740                  *
741                  * We have to drop our journal pin before writing the journal
742                  * entry that points to the new btree root: else, we could
743                  * deadlock if the journal currently happens to be full.
744                  *
745                  * This means we're dropping the journal pin _before_ the new
746                  * nodes are technically reachable - but this is safe, because
747                  * after the bch2_btree_set_root_ondisk() call above they will
748                  * be reachable as of the very next journal write:
749                  */
750                 bch2_journal_pin_drop(&c->journal, &as->journal);
751
752                 as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
753
754                 btree_update_wait_on_journal(cl);
755                 return;
756         }
757
758         continue_at(cl, btree_update_nodes_reachable, system_wq);
759 }
760
761 /*
762  * We're updating @b with pointers to nodes that haven't finished writing yet:
763  * block @b from being written until @as completes
764  */
765 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
766 {
767         struct bch_fs *c = as->c;
768
769         mutex_lock(&c->btree_interior_update_lock);
770
771         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
772         BUG_ON(!btree_node_dirty(b));
773
774         as->mode = BTREE_INTERIOR_UPDATING_NODE;
775         as->b = b;
776         list_add(&as->write_blocked_list, &b->write_blocked);
777
778         mutex_unlock(&c->btree_interior_update_lock);
779
780         /*
781          * In general, when you're staging things in a journal that will later
782          * be written elsewhere, and you also want to guarantee ordering - that
783          * is, if you have updates a, b, c, then after a crash you should never
784          * see c without also seeing a and b - there's a problem:
785          *
786          * If the final destination of the update(s) (i.e. btree node) can be
787          * written/flushed _before_ the relevant journal entry - oops, that
788          * breaks ordering, since the various leaf nodes can be written in any
789          * order.
790          *
791          * Normally we use bset->journal_seq to deal with this - if during
792          * recovery we find a btree node write that's newer than the newest
793          * journal entry, we just ignore it - we don't need it, anything we're
794          * supposed to have (that we reported as completed via fsync()) will
795          * still be in the journal, and as far as the state of the journal is
796          * concerned that btree node write never happened.
797          *
798          * That breaks when we're rewriting/splitting/merging nodes, since we're
799          * mixing btree node writes that haven't happened yet with previously
800          * written data that has been reported as completed to the journal.
801          *
802          * Thus, before making the new nodes reachable, we have to wait for the
803          * newest journal sequence number we have data for to be written (if it
804          * hasn't been yet).
805          */
806         bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
807 }
808
809 static void interior_update_flush(struct journal *j,
810                         struct journal_entry_pin *pin, u64 seq)
811 {
812         struct btree_update *as =
813                 container_of(pin, struct btree_update, journal);
814
815         bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
816 }
817
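/*
 * The node that @child was updating is being freed/rewritten by @as: switch
 * @child to BTREE_INTERIOR_UPDATING_AS mode so it completes through @as, and
 * transfer its journal pin to @as (see below).
 */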
818 static void btree_update_reparent(struct btree_update *as,
819                                   struct btree_update *child)
820 {
821         struct bch_fs *c = as->c;
822
823         child->b = NULL;
824         child->mode = BTREE_INTERIOR_UPDATING_AS;
825         child->parent_as = as;
826         closure_get(&as->cl);
827
828         /*
829          * When we write a new btree root, we have to drop our journal pin
830          * _before_ the new nodes are technically reachable; see
831          * btree_update_nodes_written().
832          *
833          * This goes for journal pins that are recursively blocked on us - so,
834          * just transfer the journal pin to the new interior update so
835          * btree_update_nodes_written() can drop it.
836          */
837         bch2_journal_pin_add_if_older(&c->journal, &child->journal,
838                                       &as->journal, interior_update_flush);
839         bch2_journal_pin_drop(&c->journal, &child->journal);
840
841         as->journal_seq = max(as->journal_seq, child->journal_seq);
842 }
843
844 static void btree_update_updated_root(struct btree_update *as)
845 {
846         struct bch_fs *c = as->c;
847         struct btree_root *r = &c->btree_roots[as->btree_id];
848
849         mutex_lock(&c->btree_interior_update_lock);
850
851         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
852
853         /*
854          * Old root might not be persistent yet - if so, redirect its
855          * btree_update operation to point to us:
856          */
857         if (r->as)
858                 btree_update_reparent(as, r->as);
859
860         as->mode = BTREE_INTERIOR_UPDATING_ROOT;
861         as->b = r->b;
862         r->as = as;
863
864         mutex_unlock(&c->btree_interior_update_lock);
865
866         /*
867          * When we're rewriting nodes and updating interior nodes, there's an
868          * issue with updates that haven't been written in the journal getting
869          * mixed together with older data - see btree_update_updated_node()
870          * for the explanation.
871          *
872          * However, this doesn't affect us when we're writing a new btree root -
873          * because to make that new root reachable we have to write out a new
874          * journal entry, which must necessarily be newer than as->journal_seq.
875          */
876 }
877
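/*
 * Track @b as a new node that @as will make reachable: b->will_make_reachable
 * points back at @as and keeps @b from being written until the update
 * completes.
 */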
878 static void btree_node_will_make_reachable(struct btree_update *as,
879                                            struct btree *b)
880 {
881         struct bch_fs *c = as->c;
882
883         mutex_lock(&c->btree_interior_update_lock);
884         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
885         BUG_ON(b->will_make_reachable);
886
887         as->new_nodes[as->nr_new_nodes++] = b;
888         b->will_make_reachable = 1UL|(unsigned long) as;
889
890         closure_get(&as->cl);
891         mutex_unlock(&c->btree_interior_update_lock);
892 }
893
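/*
 * @b is being freed before the btree_update that allocated it has made it
 * reachable: take it off that update's list of new nodes, dropping the
 * update's closure ref if one is still held.
 */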
894 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
895 {
896         struct btree_update *as;
897         unsigned long v;
898         unsigned i;
899
900         mutex_lock(&c->btree_interior_update_lock);
901         v = xchg(&b->will_make_reachable, 0);
902         as = (struct btree_update *) (v & ~1UL);
903
904         if (!as) {
905                 mutex_unlock(&c->btree_interior_update_lock);
906                 return;
907         }
908
909         for (i = 0; i < as->nr_new_nodes; i++)
910                 if (as->new_nodes[i] == b)
911                         goto found;
912
913         BUG();
914 found:
915         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
916         mutex_unlock(&c->btree_interior_update_lock);
917
918         if (v & 1)
919                 closure_put(&as->cl);
920 }
921
922 static void btree_interior_update_add_node_reference(struct btree_update *as,
923                                                      struct btree *b)
924 {
925         struct bch_fs *c = as->c;
926         struct pending_btree_node_free *d;
927
928         mutex_lock(&c->btree_interior_update_lock);
929
930         /* Add this node to the list of nodes being freed: */
931         BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
932
933         d = &as->pending[as->nr_pending++];
934         d->index_update_done    = false;
935         d->seq                  = b->data->keys.seq;
936         d->btree_id             = b->btree_id;
937         d->level                = b->level;
938         bkey_copy(&d->key, &b->key);
939
940         mutex_unlock(&c->btree_interior_update_lock);
941 }
942
943 /*
944  * @b is being split/rewritten: it may have pointers to not-yet-written btree
945  * nodes and thus outstanding btree_updates - redirect @b's
946  * btree_updates to point to this btree_update:
947  */
948 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
949                                                struct btree *b)
950 {
951         struct bch_fs *c = as->c;
952         struct closure *cl, *cl_n;
953         struct btree_update *p, *n;
954         struct btree_write *w;
955         struct bset_tree *t;
956
957         set_btree_node_dying(b);
958
959         if (btree_node_fake(b))
960                 return;
961
962         btree_interior_update_add_node_reference(as, b);
963
964         /*
965          * Does this node have data that hasn't been written in the journal?
966          *
967          * If so, we have to wait for the corresponding journal entry to be
968          * written before making the new nodes reachable - we can't just carry
969          * over the bset->journal_seq tracking, since we'll be mixing those keys
970          * in with keys that aren't in the journal anymore:
971          */
972         for_each_bset(b, t)
973                 as->journal_seq = max(as->journal_seq,
974                                       le64_to_cpu(bset(b, t)->journal_seq));
975
976         mutex_lock(&c->btree_interior_update_lock);
977
978         /*
979          * Does this node have any btree_update operations preventing
980          * it from being written?
981          *
982          * If so, redirect them to point to this btree_update: we can
983          * write out our new nodes, but we won't make them visible until those
984          * operations complete
985          */
986         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
987                 list_del(&p->write_blocked_list);
988                 btree_update_reparent(as, p);
989         }
990
991         clear_btree_node_dirty(b);
992         clear_btree_node_need_write(b);
993         w = btree_current_write(b);
994
995         /*
996          * Does this node have any btree_update operations waiting on this node
997          * to be written?
998          *
999          * If so, wake them up when this btree_update operation is reachable:
1000          */
1001         llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
1002                 llist_add(&cl->list, &as->wait.list);
1003
1004         /*
1005          * Does this node have unwritten data that has a pin on the journal?
1006          *
1007          * If so, transfer that pin to the btree_update operation -
1008          * note that if we're freeing multiple nodes, we only need to keep the
1009          * oldest pin of any of the nodes we're freeing. We'll release the pin
1010          * when the new nodes are persistent and reachable on disk:
1011          */
1012         bch2_journal_pin_add_if_older(&c->journal, &w->journal,
1013                                       &as->journal, interior_update_flush);
1014         bch2_journal_pin_drop(&c->journal, &w->journal);
1015
1016         w = btree_prev_write(b);
1017         bch2_journal_pin_add_if_older(&c->journal, &w->journal,
1018                                       &as->journal, interior_update_flush);
1019         bch2_journal_pin_drop(&c->journal, &w->journal);
1020
1021         mutex_unlock(&c->btree_interior_update_lock);
1022 }
1023
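/*
 * All insertions for this interior node update are done: return the unused
 * part of the reserve, and run btree_update_nodes_written() once the
 * remaining refs on the update's closure have been dropped.
 */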
1024 void bch2_btree_update_done(struct btree_update *as)
1025 {
1026         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
1027
1028         bch2_btree_reserve_put(as->c, as->reserve);
1029         as->reserve = NULL;
1030
1031         continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
1032 }
1033
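/*
 * Start a new interior node update: takes a ref on c->writes and a reserve of
 * @nr_nodes preallocated nodes that the update will allocate from.
 *
 * Rough usage sketch (see btree_split() and its callers for the real thing):
 *
 *	as = bch2_btree_update_start(c, id, nr_nodes, flags, &cl);
 *	bch2_btree_interior_update_will_free_node(as, b);
 *	n = bch2_btree_node_alloc_replacement(as, b);
 *	... write out @n, insert the new pointer(s) or set the new root ...
 *	bch2_btree_update_done(as);
 */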
1034 struct btree_update *
1035 bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
1036                         unsigned nr_nodes, unsigned flags,
1037                         struct closure *cl)
1038 {
1039         struct btree_reserve *reserve;
1040         struct btree_update *as;
1041
1042         if (unlikely(!percpu_ref_tryget(&c->writes)))
1043                 return ERR_PTR(-EROFS);
1044
1045         reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
1046         if (IS_ERR(reserve)) {
1047                 percpu_ref_put(&c->writes);
1048                 return ERR_CAST(reserve);
1049         }
1050
1051         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
1052         memset(as, 0, sizeof(*as));
1053         closure_init(&as->cl, NULL);
1054         as->c           = c;
1055         as->mode        = BTREE_INTERIOR_NO_UPDATE;
1056         as->btree_id    = id;
1057         as->reserve     = reserve;
1058         INIT_LIST_HEAD(&as->write_blocked_list);
1059
1060         bch2_keylist_init(&as->parent_keys, as->inline_keys);
1061
1062         mutex_lock(&c->btree_interior_update_lock);
1063         list_add_tail(&as->list, &c->btree_interior_update_list);
1064         mutex_unlock(&c->btree_interior_update_lock);
1065
1066         return as;
1067 }
1068
1069 /* Btree root updates: */
1070
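/*
 * Make @b the in-memory root of its btree: take it off the btree cache's
 * lists (root nodes can't be reaped) and point btree_node_root() at it.
 */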
1071 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1072 {
1073         /* Root nodes cannot be reaped */
1074         mutex_lock(&c->btree_cache.lock);
1075         list_del_init(&b->list);
1076         mutex_unlock(&c->btree_cache.lock);
1077
1078         mutex_lock(&c->btree_root_lock);
1079         BUG_ON(btree_node_root(c, b) &&
1080                (b->level < btree_node_root(c, b)->level ||
1081                 !btree_node_dying(btree_node_root(c, b))));
1082
1083         btree_node_root(c, b) = b;
1084         mutex_unlock(&c->btree_root_lock);
1085
1086         bch2_recalc_btree_reserve(c);
1087 }
1088
1089 static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
1090 {
1091         struct bch_fs *c = as->c;
1092         struct btree *old = btree_node_root(c, b);
1093         struct bch_fs_usage stats = { 0 };
1094
1095         __bch2_btree_set_root_inmem(c, b);
1096
1097         bch2_mark_key(c, bkey_i_to_s_c(&b->key),
1098                       c->opts.btree_node_size, true,
1099                       gc_pos_btree_root(b->btree_id),
1100                       &stats, 0, 0);
1101
1102         if (old && !btree_node_fake(old))
1103                 bch2_btree_node_free_index(as, NULL,
1104                                            bkey_i_to_s_c(&old->key),
1105                                            &stats);
1106         bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1107                             gc_pos_btree_root(b->btree_id));
1108 }
1109
1110 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
1111 {
1112         struct btree_root *r = &c->btree_roots[b->btree_id];
1113
1114         mutex_lock(&c->btree_root_lock);
1115
1116         BUG_ON(b != r->b);
1117         bkey_copy(&r->key, &b->key);
1118         r->level = b->level;
1119         r->alive = true;
1120         if (rw == WRITE)
1121                 c->btree_roots_dirty = true;
1122
1123         mutex_unlock(&c->btree_root_lock);
1124 }
1125
1126 /**
1127  * bch2_btree_set_root - update the root in memory and on disk
1128  *
1129  * To ensure forward progress, the current task must not be holding any
1130  * btree node write locks. However, you must hold an intent lock on the
1131  * old root.
1132  *
1133  * Note: This allocates a journal entry but doesn't add any keys to
1134  * it.  All the btree roots are part of every journal write, so there
1135  * is nothing new to be done.  This just guarantees that there is a
1136  * journal write.
1137  */
1138 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
1139                                 struct btree_iter *iter)
1140 {
1141         struct bch_fs *c = as->c;
1142         struct btree *old;
1143
1144         trace_btree_set_root(c, b);
1145         BUG_ON(!b->written);
1146
1147         old = btree_node_root(c, b);
1148
1149         /*
1150          * Ensure no one is using the old root while we switch to the
1151          * new root:
1152          */
1153         bch2_btree_node_lock_write(old, iter);
1154
1155         bch2_btree_set_root_inmem(as, b);
1156
1157         btree_update_updated_root(as);
1158
1159         /*
1160          * Unlock old root after new root is visible:
1161          *
1162          * The new root isn't persistent, but that's ok: we still have
1163          * an intent lock on the new root, and any updates that would
1164          * depend on the new root would have to update the new root.
1165          */
1166         bch2_btree_node_unlock_write(old, iter);
1167 }
1168
1169 /* Interior node updates: */
1170
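/*
 * Insert the btree node pointer @insert into interior node @b: account for
 * the new pointer, mark any key it overwrites as freed from the index
 * (bch2_btree_node_free_index()), then do the bset insert itself.
 */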
1171 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1172                                         struct btree_iter *iter,
1173                                         struct bkey_i *insert,
1174                                         struct btree_node_iter *node_iter)
1175 {
1176         struct bch_fs *c = as->c;
1177         struct bch_fs_usage stats = { 0 };
1178         struct bkey_packed *k;
1179         struct bkey tmp;
1180
1181         BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
1182
1183         if (bkey_extent_is_data(&insert->k))
1184                 bch2_mark_key(c, bkey_i_to_s_c(insert),
1185                              c->opts.btree_node_size, true,
1186                              gc_pos_btree_node(b), &stats, 0, 0);
1187
1188         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1189                !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
1190                 bch2_btree_node_iter_advance(node_iter, b);
1191
1192         /*
1193          * If we're overwriting, look up pending delete and mark so that gc
1194          * marks it on the pending delete list:
1195          */
1196         if (k && !bkey_cmp_packed(b, k, &insert->k))
1197                 bch2_btree_node_free_index(as, b,
1198                                            bkey_disassemble(b, k, &tmp),
1199                                            &stats);
1200
1201         bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1202                             gc_pos_btree_node(b));
1203
1204         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1205         set_btree_node_dirty(b);
1206         set_btree_node_need_write(b);
1207 }
1208
1209 /*
1210  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1211  * node)
1212  */
1213 static struct btree *__btree_split_node(struct btree_update *as,
1214                                         struct btree *n1,
1215                                         struct btree_iter *iter)
1216 {
1217         size_t nr_packed = 0, nr_unpacked = 0;
1218         struct btree *n2;
1219         struct bset *set1, *set2;
1220         struct bkey_packed *k, *prev = NULL;
1221
1222         n2 = bch2_btree_node_alloc(as, n1->level);
1223
1224         n2->data->max_key       = n1->data->max_key;
1225         n2->data->format        = n1->format;
1226         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1227         n2->key.k.p = n1->key.k.p;
1228
1229         btree_node_set_format(n2, n2->data->format);
1230
1231         set1 = btree_bset_first(n1);
1232         set2 = btree_bset_first(n2);
1233
1234         /*
1235          * Has to be a linear search because we don't have an auxiliary
1236          * search tree yet
1237          */
1238         k = set1->start;
1239         while (1) {
1240                 if (bkey_next(k) == vstruct_last(set1))
1241                         break;
1242                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1243                         break;
1244
1245                 if (bkey_packed(k))
1246                         nr_packed++;
1247                 else
1248                         nr_unpacked++;
1249
1250                 prev = k;
1251                 k = bkey_next(k);
1252         }
1253
1254         BUG_ON(!prev);
1255
1256         n1->key.k.p = bkey_unpack_pos(n1, prev);
1257         n1->data->max_key = n1->key.k.p;
1258         n2->data->min_key =
1259                 btree_type_successor(n1->btree_id, n1->key.k.p);
1260
1261         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1262         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1263
1264         set_btree_bset_end(n1, n1->set);
1265         set_btree_bset_end(n2, n2->set);
1266
1267         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1268         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1269         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1270         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1271
1272         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1273         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1274         n1->nr.packed_keys      = nr_packed;
1275         n1->nr.unpacked_keys    = nr_unpacked;
1276
1277         BUG_ON(!set1->u64s);
1278         BUG_ON(!set2->u64s);
1279
1280         memcpy_u64s(set2->start,
1281                     vstruct_end(set1),
1282                     le16_to_cpu(set2->u64s));
1283
1284         btree_node_reset_sib_u64s(n1);
1285         btree_node_reset_sib_u64s(n2);
1286
1287         bch2_verify_btree_nr_keys(n1);
1288         bch2_verify_btree_nr_keys(n2);
1289
1290         if (n1->level) {
1291                 btree_node_interior_verify(n1);
1292                 btree_node_interior_verify(n2);
1293         }
1294
1295         return n2;
1296 }
1297
1298 /*
1299  * For updates to interior nodes, we've got to do the insert before we split
1300  * because the stuff we're inserting has to be inserted atomically. Post split,
1301  * the keys might have to go in different nodes and the split would no longer be
1302  * atomic.
1303  *
1304  * Worse, if the insert is from btree node coalescing, if we do the insert after
1305  * we do the split (and pick the pivot) - the pivot we pick might be between
1306  * nodes that were coalesced, and thus in the middle of a child node post
1307  * coalescing:
1308  */
1309 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1310                                     struct btree_iter *iter,
1311                                     struct keylist *keys)
1312 {
1313         struct btree_node_iter node_iter;
1314         struct bkey_i *k = bch2_keylist_front(keys);
1315         struct bkey_packed *p;
1316         struct bset *i;
1317
1318         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1319
1320         bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
1321
1322         while (!bch2_keylist_empty(keys)) {
1323                 k = bch2_keylist_front(keys);
1324
1325                 BUG_ON(bch_keylist_u64s(keys) >
1326                        bch_btree_keys_u64s_remaining(as->c, b));
1327                 BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
1328                 BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
1329
1330                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1331                 bch2_keylist_pop_front(keys);
1332         }
1333
1334         /*
1335          * We can't tolerate whiteouts here - with whiteouts there can be
1336          * duplicate keys, and it would be rather bad if we picked a duplicate
1337          * for the pivot:
1338          */
1339         i = btree_bset_first(b);
1340         p = i->start;
1341         while (p != vstruct_last(i))
1342                 if (bkey_deleted(p)) {
1343                         le16_add_cpu(&i->u64s, -p->u64s);
1344                         set_btree_bset_end(b, b->set);
1345                         memmove_u64s_down(p, bkey_next(p),
1346                                           (u64 *) vstruct_last(i) -
1347                                           (u64 *) p);
1348                 } else
1349                         p = bkey_next(p);
1350
1351         BUG_ON(b->nsets != 1 ||
1352                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1353
1354         btree_node_interior_verify(b);
1355 }
1356
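/*
 * Rewrite @b (after inserting @keys, if any): if the result is bigger than
 * BTREE_SPLIT_THRESHOLD blocks it's split into two nodes, allocating a new
 * root if the depth needs to increase; the new pointers are then inserted
 * into the parent (or set as the new root) and @b is freed.
 */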
1357 static void btree_split(struct btree_update *as, struct btree *b,
1358                         struct btree_iter *iter, struct keylist *keys,
1359                         unsigned flags)
1360 {
1361         struct bch_fs *c = as->c;
1362         struct btree *parent = btree_node_parent(iter, b);
1363         struct btree *n1, *n2 = NULL, *n3 = NULL;
1364         u64 start_time = local_clock();
1365
1366         BUG_ON(!parent && (b != btree_node_root(c, b)));
1367         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1368
1369         bch2_btree_interior_update_will_free_node(as, b);
1370
1371         n1 = bch2_btree_node_alloc_replacement(as, b);
1372
1373         if (keys)
1374                 btree_split_insert_keys(as, n1, iter, keys);
1375
1376         if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
1377                 trace_btree_split(c, b);
1378
1379                 n2 = __btree_split_node(as, n1, iter);
1380
1381                 bch2_btree_build_aux_trees(n2);
1382                 bch2_btree_build_aux_trees(n1);
1383                 six_unlock_write(&n2->lock);
1384                 six_unlock_write(&n1->lock);
1385
1386                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1387
1388                 /*
1389                  * Note that on recursive splits, parent_keys == keys, so we
1390                  * can't start adding new keys to parent_keys before emptying it
1391                  * out (which we did with btree_split_insert_keys() above)
1392                  */
1393                 bch2_keylist_add(&as->parent_keys, &n1->key);
1394                 bch2_keylist_add(&as->parent_keys, &n2->key);
1395
1396                 if (!parent) {
1397                         /* Depth increases, make a new root */
1398                         n3 = __btree_root_alloc(as, b->level + 1);
1399
1400                         n3->sib_u64s[0] = U16_MAX;
1401                         n3->sib_u64s[1] = U16_MAX;
1402
1403                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1404
1405                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1406                 }
1407         } else {
1408                 trace_btree_compact(c, b);
1409
1410                 bch2_btree_build_aux_trees(n1);
1411                 six_unlock_write(&n1->lock);
1412
1413                 bch2_keylist_add(&as->parent_keys, &n1->key);
1414         }
1415
1416         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1417
1418         /* New nodes all written, now make them visible: */
1419
1420         if (parent) {
1421                 /* Split a non root node */
1422                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1423         } else if (n3) {
1424                 bch2_btree_set_root(as, n3, iter);
1425         } else {
1426                 /* Root filled up but didn't need to be split */
1427                 bch2_btree_set_root(as, n1, iter);
1428         }
1429
1430         bch2_btree_open_bucket_put(c, n1);
1431         if (n2)
1432                 bch2_btree_open_bucket_put(c, n2);
1433         if (n3)
1434                 bch2_btree_open_bucket_put(c, n3);
1435
1436         /*
1437          * Note - at this point other linked iterators could still have @b read
1438          * locked; we're depending on the bch2_btree_iter_node_replace() calls
1439          * below removing all references to @b so we don't return with other
1440          * iterators pointing to a node they have locked that's been freed.
1441          *
1442          * We have to free the node first because the bch2_btree_iter_node_replace()
1443          * calls will drop _our_ iterator's reference - and intent lock - to @b.
1444          */
1445         bch2_btree_node_free_inmem(c, b, iter);
1446
1447         /* Successful split, update the iterator to point to the new nodes: */
1448
1449         if (n3)
1450                 bch2_btree_iter_node_replace(iter, n3);
1451         if (n2)
1452                 bch2_btree_iter_node_replace(iter, n2);
1453         bch2_btree_iter_node_replace(iter, n1);
1454
1455         bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
1456 }
1457
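/*
 * Insert @keys into interior node @b at @iter's current position; the caller
 * has already checked that they fit without splitting.
 */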
1458 static void
1459 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1460                                 struct btree_iter *iter, struct keylist *keys)
1461 {
1462         struct btree_iter *linked;
1463         struct btree_node_iter node_iter;
1464         struct bkey_i *insert = bch2_keylist_front(keys);
1465         struct bkey_packed *k;
1466
1467         /* Don't screw up @iter's position: */
1468         node_iter = iter->l[b->level].iter;
1469
1470         /*
1471          * btree_split(), btree_gc_coalesce() will insert keys before
1472          * the iterator's current position - they know the keys go in
1473          * the node the iterator points to:
1474          */
1475         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1476                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1477                 ;
1478
1479         while (!bch2_keylist_empty(keys)) {
1480                 insert = bch2_keylist_front(keys);
1481
1482                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1483                 bch2_keylist_pop_front(keys);
1484         }
1485
1486         btree_update_updated_node(as, b);
1487
1488         for_each_btree_iter_with_node(iter, b, linked)
1489                 bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
1490
1491         bch2_btree_iter_verify(iter, b);
1492 }
1493
1494 /**
1495  * bch2_btree_insert_node - insert bkeys into a given btree node
1496  *
1497  * @as:                 the btree_update this insert is part of
1498  * @b:                  node to insert into
1499  * @iter:               btree iterator
1500  * @keys:               list of keys to insert
1501  * @flags:              BTREE_INSERT_* flags
1502  *
1503  * Inserts as many keys as it can into a given btree node, splitting it if full;
1504  * interior inserts are atomic, so on a split the keys go through btree_split().
1505  */
1506 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1507                             struct btree_iter *iter, struct keylist *keys,
1508                             unsigned flags)
1509 {
1510         struct bch_fs *c = as->c;
1511         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1512         int old_live_u64s = b->nr.live_u64s;
1513         int live_u64s_added, u64s_added;
1514
1515         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1516         BUG_ON(!b->level);
1517         BUG_ON(!as || as->b);
1518         bch2_verify_keylist_sorted(keys);
1519
1520         if (as->must_rewrite)
1521                 goto split;
1522
1523         bch2_btree_node_lock_for_insert(c, b, iter);
1524
1525         if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
1526                 bch2_btree_node_unlock_write(b, iter);
1527                 goto split;
1528         }
1529
1530         bch2_btree_insert_keys_interior(as, b, iter, keys);
1531
1532         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1533         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1534
1535         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1536                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1537         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1538                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1539
1540         if (u64s_added > live_u64s_added &&
1541             bch2_maybe_compact_whiteouts(c, b))
1542                 bch2_btree_iter_reinit_node(iter, b);
1543
1544         bch2_btree_node_unlock_write(b, iter);
1545
1546         btree_node_interior_verify(b);
1547
1548         bch2_foreground_maybe_merge(c, iter, b->level, flags);
1549         return;
1550 split:
1551         btree_split(as, b, iter, keys, flags);
1552 }
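
     /*
      * Illustrative caller pattern (a sketch drawn from the merge/rewrite
      * paths later in this file, not a separate API): interior inserts go
      * through a btree_update and its parent_keys keylist, e.g.
      *
      *         bch2_keylist_add(&as->parent_keys, &n->key);
      *         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
      */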
1553
1554 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1555                           unsigned flags)
1556 {
1557         struct btree *b = iter->l[0].b;
1558         struct btree_update *as;
1559         struct closure cl;
1560         int ret = 0;
1561         struct btree_iter *linked;
1562
1563         /*
1564          * We already have a disk reservation and open buckets pinned; this
1565          * allocation must not block:
1566          */
1567         for_each_btree_iter(iter, linked)
1568                 if (linked->btree_id == BTREE_ID_EXTENTS)
1569                         flags |= BTREE_INSERT_USE_RESERVE;
1570
1571         closure_init_stack(&cl);
1572
1573         /* Hack, because gc and splitting nodes don't mix yet: */
1574         if (!down_read_trylock(&c->gc_lock)) {
1575                 if (flags & BTREE_INSERT_NOUNLOCK)
1576                         return -EINTR;
1577
1578                 bch2_btree_iter_unlock(iter);
1579                 down_read(&c->gc_lock);
1580
1581                 if (btree_iter_linked(iter))
1582                         ret = -EINTR;
1583         }
1584
1585         /*
1586          * XXX: figure out how far we might need to split,
1587          * instead of locking/reserving all the way to the root:
1588          */
1589         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1590                 ret = -EINTR;
1591                 goto out;
1592         }
1593
1594         as = bch2_btree_update_start(c, iter->btree_id,
1595                 btree_update_reserve_required(c, b), flags,
1596                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1597         if (IS_ERR(as)) {
1598                 ret = PTR_ERR(as);
1599                 if (ret == -EAGAIN) {
1600                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1601                         bch2_btree_iter_unlock(iter);
1602                         ret = -EINTR;
1603                 }
1604                 goto out;
1605         }
1606
1607         btree_split(as, b, iter, NULL, flags);
1608         bch2_btree_update_done(as);
1609
1610         /*
1611          * We haven't successfully inserted yet, so don't downgrade all the way
1612          * back to read locks.
1613          */
1614         __bch2_btree_iter_downgrade(iter, 1);
1615 out:
1616         up_read(&c->gc_lock);
1617         closure_sync(&cl);
1618         return ret;
1619 }
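
     /*
      * Hypothetical caller sketch (an assumption, not taken from this file):
      * -EINTR from bch2_btree_split_leaf() means locks were dropped, so a
      * caller would typically re-traverse the iterator and retry:
      *
      *         while ((ret = bch2_btree_split_leaf(c, iter, flags)) == -EINTR) {
      *                 ret = bch2_btree_iter_traverse(iter);
      *                 if (ret)
      *                         break;
      *         }
      */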
1620
1621 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1622                                    struct btree_iter *iter,
1623                                    unsigned level,
1624                                    unsigned flags,
1625                                    enum btree_node_sibling sib)
1626 {
1627         struct btree_update *as;
1628         struct bkey_format_state new_s;
1629         struct bkey_format new_f;
1630         struct bkey_i delete;
1631         struct btree *b, *m, *n, *prev, *next, *parent;
1632         struct closure cl;
1633         size_t sib_u64s;
1634         int ret = 0;
1635
1636         closure_init_stack(&cl);
1637 retry:
1638         BUG_ON(!btree_node_locked(iter, level));
1639
1640         b = iter->l[level].b;
1641
1642         parent = btree_node_parent(iter, b);
1643         if (!parent)
1644                 goto out;
1645
1646         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1647                 goto out;
1648
1649         /* XXX: can't be holding read locks */
1650         m = bch2_btree_node_get_sibling(c, iter, b,
1651                         !(flags & BTREE_INSERT_NOUNLOCK), sib);
1652         if (IS_ERR(m)) {
1653                 ret = PTR_ERR(m);
1654                 goto err;
1655         }
1656
1657         /* NULL means no sibling: */
1658         if (!m) {
1659                 b->sib_u64s[sib] = U16_MAX;
1660                 goto out;
1661         }
1662
1663         if (sib == btree_prev_sib) {
1664                 prev = m;
1665                 next = b;
1666         } else {
1667                 prev = b;
1668                 next = m;
1669         }
1670
1671         bch2_bkey_format_init(&new_s);
1672         __bch2_btree_calc_format(&new_s, b);
1673         __bch2_btree_calc_format(&new_s, m);
1674         new_f = bch2_bkey_format_done(&new_s);
1675
1676         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1677                 btree_node_u64s_with_format(m, &new_f);
1678
1679         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1680                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1681                 sib_u64s /= 2;
1682                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1683         }
1684
1685         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1686         b->sib_u64s[sib] = sib_u64s;
1687
1688         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1689                 six_unlock_intent(&m->lock);
1690                 goto out;
1691         }
1692
1693         /* We're changing btree topology, which doesn't mix with gc: */
1694         if (!down_read_trylock(&c->gc_lock))
1695                 goto err_cycle_gc_lock;
1696
1697         if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
1698                 ret = -EINTR;
1699                 goto err_unlock;
1700         }
1701
1702         as = bch2_btree_update_start(c, iter->btree_id,
1703                          btree_update_reserve_required(c, parent) + 1,
1704                          BTREE_INSERT_NOFAIL|
1705                          BTREE_INSERT_USE_RESERVE,
1706                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1707         if (IS_ERR(as)) {
1708                 ret = PTR_ERR(as);
1709                 goto err_unlock;
1710         }
1711
1712         trace_btree_merge(c, b);
1713
1714         bch2_btree_interior_update_will_free_node(as, b);
1715         bch2_btree_interior_update_will_free_node(as, m);
1716
1717         n = bch2_btree_node_alloc(as, b->level);
1718
1719         n->data->min_key        = prev->data->min_key;
1720         n->data->max_key        = next->data->max_key;
1721         n->data->format         = new_f;
1722         n->key.k.p              = next->key.k.p;
1723
1724         btree_node_set_format(n, new_f);
1725
1726         bch2_btree_sort_into(c, n, prev);
1727         bch2_btree_sort_into(c, n, next);
1728
1729         bch2_btree_build_aux_trees(n);
1730         six_unlock_write(&n->lock);
1731
1732         bkey_init(&delete.k);
1733         delete.k.p = prev->key.k.p;
1734         bch2_keylist_add(&as->parent_keys, &delete);
1735         bch2_keylist_add(&as->parent_keys, &n->key);
1736
1737         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1738
1739         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1740
1741         bch2_btree_open_bucket_put(c, n);
1742         bch2_btree_node_free_inmem(c, b, iter);
1743         bch2_btree_node_free_inmem(c, m, iter);
1744         bch2_btree_iter_node_replace(iter, n);
1745
1746         bch2_btree_iter_verify(iter, n);
1747
1748         bch2_btree_update_done(as);
1749
1750         six_unlock_intent(&m->lock);
1751         up_read(&c->gc_lock);
1752 out:
1753         /*
1754          * Don't downgrade locks here: we're called after a successful insert,
1755          * and the caller will downgrade locks anyway (in case e.g. a split
1756          * was required first).
1757          *
1758          * We're also called when inserting into interior nodes from the
1759          * split path, and downgrading to read locks there is potentially
1760          * confusing:
1761          */
1762         closure_sync(&cl);
1763         return;
1764
1765 err_cycle_gc_lock:
1766         six_unlock_intent(&m->lock);
1767
1768         if (flags & BTREE_INSERT_NOUNLOCK)
1769                 goto out;
1770
1771         bch2_btree_iter_unlock(iter);
1772
1773         down_read(&c->gc_lock);
1774         up_read(&c->gc_lock);
1775         ret = -EINTR;
1776         goto err;
1777
1778 err_unlock:
1779         six_unlock_intent(&m->lock);
1780         up_read(&c->gc_lock);
1781 err:
1782         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1783
1784         if ((ret == -EAGAIN || ret == -EINTR) &&
1785             !(flags & BTREE_INSERT_NOUNLOCK)) {
1786                 bch2_btree_iter_unlock(iter);
1787                 closure_sync(&cl);
1788                 ret = bch2_btree_iter_traverse(iter);
1789                 if (ret)
1790                         goto out;
1791
1792                 goto retry;
1793         }
1794
1795         goto out;
1796 }
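
     /*
      * Descriptive note on the merge above: the two siblings are sorted into
      * a single new node @n covering both key ranges, and the parent sees
      * exactly two keys - a whiteout for the key that disappears plus @n's
      * key:
      *
      *         bkey_init(&delete.k);
      *         delete.k.p = prev->key.k.p;
      *         bch2_keylist_add(&as->parent_keys, &delete);
      *         bch2_keylist_add(&as->parent_keys, &n->key);
      */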
1797
1798 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1799                                 struct btree *b, unsigned flags,
1800                                 struct closure *cl)
1801 {
1802         struct btree *n, *parent = btree_node_parent(iter, b);
1803         struct btree_update *as;
1804
1805         as = bch2_btree_update_start(c, iter->btree_id,
1806                 (parent
1807                  ? btree_update_reserve_required(c, parent)
1808                  : 0) + 1,
1809                 flags, cl);
1810         if (IS_ERR(as)) {
1811                 trace_btree_gc_rewrite_node_fail(c, b);
1812                 return PTR_ERR(as);
1813         }
1814
1815         bch2_btree_interior_update_will_free_node(as, b);
1816
1817         n = bch2_btree_node_alloc_replacement(as, b);
1818
1819         bch2_btree_build_aux_trees(n);
1820         six_unlock_write(&n->lock);
1821
1822         trace_btree_gc_rewrite_node(c, b);
1823
1824         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1825
1826         if (parent) {
1827                 bch2_keylist_add(&as->parent_keys, &n->key);
1828                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1829         } else {
1830                 bch2_btree_set_root(as, n, iter);
1831         }
1832
1833         bch2_btree_open_bucket_put(c, n);
1834
1835         bch2_btree_node_free_inmem(c, b, iter);
1836
1837         bch2_btree_iter_node_replace(iter, n);
1838
1839         bch2_btree_update_done(as);
1840         return 0;
1841 }
1842
1843 /**
1844  * bch2_btree_node_rewrite - Rewrite/move a btree node
1845  *
1846  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e.
1847  * bch2_btree_reserve_get() had to wait)
1848  */
1849 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1850                             __le64 seq, unsigned flags)
1851 {
1852         struct closure cl;
1853         struct btree *b;
1854         int ret;
1855
1856         flags |= BTREE_INSERT_NOFAIL;
1857
1858         closure_init_stack(&cl);
1859
1860         bch2_btree_iter_upgrade(iter, U8_MAX);
1861
1862         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1863                 if (!down_read_trylock(&c->gc_lock)) {
1864                         bch2_btree_iter_unlock(iter);
1865                         down_read(&c->gc_lock);
1866                 }
1867         }
1868
1869         while (1) {
1870                 ret = bch2_btree_iter_traverse(iter);
1871                 if (ret)
1872                         break;
1873
1874                 b = bch2_btree_iter_peek_node(iter);
1875                 if (!b || b->data->keys.seq != seq)
1876                         break;
1877
1878                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1879                 if (ret != -EAGAIN &&
1880                     ret != -EINTR)
1881                         break;
1882
1883                 bch2_btree_iter_unlock(iter);
1884                 closure_sync(&cl);
1885         }
1886
1887         bch2_btree_iter_downgrade(iter);
1888
1889         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1890                 up_read(&c->gc_lock);
1891
1892         closure_sync(&cl);
1893         return ret;
1894 }
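
     /*
      * Hypothetical usage sketch (an assumption - the real callers live
      * outside this file): capture a node's sequence number while it's
      * locked, then ask for a rewrite; the seq check above makes the rewrite
      * a no-op if the node has since been freed or replaced:
      *
      *         struct btree *b = bch2_btree_iter_peek_node(iter);
      *         __le64 seq = b->data->keys.seq;
      *         ...
      *         ret = bch2_btree_node_rewrite(c, iter, seq, 0);
      */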
1895
1896 static void __bch2_btree_node_update_key(struct bch_fs *c,
1897                                          struct btree_update *as,
1898                                          struct btree_iter *iter,
1899                                          struct btree *b, struct btree *new_hash,
1900                                          struct bkey_i_extent *new_key)
1901 {
1902         struct btree *parent;
1903         int ret;
1904
1905         /*
1906          * Two corner cases that need to be thought about here:
1907          *
1908          * @b may not be reachable yet - there might be another interior update
1909          * operation waiting on @b to be written, and we're gonna deliver the
1910          * write completion to that interior update operation _before_
1911          * persisting the new_key update
1912          *
1913          * That ends up working without us having to do anything special here:
1914          * the reason is, we do kick off (and do the in memory updates) for the
1915          * update for @new_key before we return, creating a new interior_update
1916          * operation here.
1917          *
1918          * The new interior update operation here will in effect override the
1919          * previous one. The previous one was going to terminate - make @b
1920          * reachable - in one of two ways:
1921          * - updating the btree root pointer
1922          *   In that case,
1923          *   no, this doesn't work. argh.
1924          */
1925
1926         if (b->will_make_reachable)
1927                 as->must_rewrite = true;
1928
1929         btree_interior_update_add_node_reference(as, b);
1930
1931         parent = btree_node_parent(iter, b);
1932         if (parent) {
1933                 if (new_hash) {
1934                         bkey_copy(&new_hash->key, &new_key->k_i);
1935                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
1936                                         new_hash, b->level, b->btree_id);
1937                         BUG_ON(ret);
1938                 }
1939
1940                 bch2_keylist_add(&as->parent_keys, &new_key->k_i);
1941                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
1942
1943                 if (new_hash) {
1944                         mutex_lock(&c->btree_cache.lock);
1945                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
1946
1947                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1948
1949                         bkey_copy(&b->key, &new_key->k_i);
1950                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1951                         BUG_ON(ret);
1952                         mutex_unlock(&c->btree_cache.lock);
1953                 } else {
1954                         bkey_copy(&b->key, &new_key->k_i);
1955                 }
1956         } else {
1957                 struct bch_fs_usage stats = { 0 };
1958
1959                 BUG_ON(btree_node_root(c, b) != b);
1960
1961                 bch2_btree_node_lock_write(b, iter);
1962
1963                 bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
1964                               c->opts.btree_node_size, true,
1965                               gc_pos_btree_root(b->btree_id),
1966                               &stats, 0, 0);
1967                 bch2_btree_node_free_index(as, NULL,
1968                                            bkey_i_to_s_c(&b->key),
1969                                            &stats);
1970                 bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1971                                     gc_pos_btree_root(b->btree_id));
1972
1973                 if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
1974                         mutex_lock(&c->btree_cache.lock);
1975                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1976
1977                         bkey_copy(&b->key, &new_key->k_i);
1978                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1979                         BUG_ON(ret);
1980                         mutex_unlock(&c->btree_cache.lock);
1981                 } else {
1982                         bkey_copy(&b->key, &new_key->k_i);
1983                 }
1984
1985                 btree_update_updated_root(as);
1986                 bch2_btree_node_unlock_write(b, iter);
1987         }
1988
1989         bch2_btree_update_done(as);
1990 }
1991
1992 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
1993                                struct btree *b, struct bkey_i_extent *new_key)
1994 {
1995         struct btree *parent = btree_node_parent(iter, b);
1996         struct btree_update *as = NULL;
1997         struct btree *new_hash = NULL;
1998         struct closure cl;
1999         int ret;
2000
2001         closure_init_stack(&cl);
2002
2003         if (!bch2_btree_iter_upgrade(iter, U8_MAX))
2004                 return -EINTR;
2005
2006         if (!down_read_trylock(&c->gc_lock)) {
2007                 bch2_btree_iter_unlock(iter);
2008                 down_read(&c->gc_lock);
2009
2010                 if (!bch2_btree_iter_relock(iter)) {
2011                         ret = -EINTR;
2012                         goto err;
2013                 }
2014         }
2015
2016         /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
2017         if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
2018                 /* bch2_btree_reserve_get will unlock */
2019                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2020                 if (ret) {
2021                         ret = -EINTR;
2022
2023                         bch2_btree_iter_unlock(iter);
2024                         up_read(&c->gc_lock);
2025                         closure_sync(&cl);
2026                         down_read(&c->gc_lock);
2027
2028                         if (!bch2_btree_iter_relock(iter))
2029                                 goto err;
2030                 }
2031
2032                 new_hash = bch2_btree_node_mem_alloc(c);
2033         }
2034
2035         as = bch2_btree_update_start(c, iter->btree_id,
2036                 parent ? btree_update_reserve_required(c, parent) : 0,
2037                 BTREE_INSERT_NOFAIL|
2038                 BTREE_INSERT_USE_RESERVE|
2039                 BTREE_INSERT_USE_ALLOC_RESERVE,
2040                 &cl);
2041
2042         if (IS_ERR(as)) {
2043                 ret = PTR_ERR(as);
2044                 if (ret == -EAGAIN)
2045                         ret = -EINTR;
2046
2047                 if (ret != -EINTR)
2048                         goto err;
2049
2050                 bch2_btree_iter_unlock(iter);
2051                 up_read(&c->gc_lock);
2052                 closure_sync(&cl);
2053                 down_read(&c->gc_lock);
2054
2055                 if (!bch2_btree_iter_relock(iter))
2056                         goto err;
2057         }
2058
2059         ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
2060                                       extent_i_to_s_c(new_key).s_c);
2061         if (ret)
2062                 goto err_free_update;
2063
2064         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
2065
2066         bch2_btree_iter_downgrade(iter);
2067 err:
2068         if (new_hash) {
2069                 mutex_lock(&c->btree_cache.lock);
2070                 list_move(&new_hash->list, &c->btree_cache.freeable);
2071                 mutex_unlock(&c->btree_cache.lock);
2072
2073                 six_unlock_write(&new_hash->lock);
2074                 six_unlock_intent(&new_hash->lock);
2075         }
2076         up_read(&c->gc_lock);
2077         closure_sync(&cl);
2078         return ret;
2079 err_free_update:
2080         bch2_btree_update_free(as);
2081         goto err;
2082 }
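
     /*
      * Descriptive note: bch2_btree_node_update_key() changes a btree node's
      * key, i.e. where the node lives on disk. When the new key hashes
      * differently (the PTR_HASH() checks above), a spare in-memory node
      * @new_hash is allocated up front and temporarily inserted into the
      * btree node hash table under the new key while the parent is updated,
      * after which @b itself is rehashed under the new key.
      */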
2083
2084 /* Init code: */
2085
2086 /*
2087  * Only for filesystem bringup, when first reading the btree roots or when
2088  * allocating them for a newly initialized filesystem:
2089  */
2090 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2091 {
2092         BUG_ON(btree_node_root(c, b));
2093
2094         __bch2_btree_set_root_inmem(c, b);
2095         bch2_btree_set_root_ondisk(c, b, READ);
2096 }
2097
2098 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
2099 {
2100         struct closure cl;
2101         struct btree *b;
2102         int ret;
2103
2104         closure_init_stack(&cl);
2105
2106         do {
2107                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2108                 closure_sync(&cl);
2109         } while (ret);
2110
2111         b = bch2_btree_node_mem_alloc(c);
2112         bch2_btree_cache_cannibalize_unlock(c);
2113
2114         set_btree_node_fake(b);
2115         b->level        = 0;
2116         b->btree_id     = id;
2117
2118         bkey_extent_init(&b->key);
2119         b->key.k.p = POS_MAX;
2120         bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
2121
2122         bch2_bset_init_first(b, &b->data->keys);
2123         bch2_btree_build_aux_trees(b);
2124
2125         b->data->min_key = POS_MIN;
2126         b->data->max_key = POS_MAX;
2127         b->data->format = bch2_btree_calc_format(b);
2128         btree_node_set_format(b, b->data->format);
2129
2130         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
2131         BUG_ON(ret);
2132
2133         __bch2_btree_set_root_inmem(c, b);
2134
2135         six_unlock_write(&b->lock);
2136         six_unlock_intent(&b->lock);
2137 }
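
     /*
      * Hypothetical usage sketch (an assumption - the actual caller is in the
      * filesystem initialization code): a brand new filesystem would allocate
      * an empty fake root for every btree:
      *
      *         unsigned id;
      *
      *         for (id = 0; id < BTREE_ID_NR; id++)
      *                 bch2_btree_root_alloc(c, id);
      */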
2138
2139 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
2140 {
2141         char *out = buf, *end = buf + PAGE_SIZE;
2142         struct btree_update *as;
2143
2144         mutex_lock(&c->btree_interior_update_lock);
2145         list_for_each_entry(as, &c->btree_interior_update_list, list)
2146                 out += scnprintf(out, end - out, "%p m %u w %u r %u j %llu\n",
2147                                  as,
2148                                  as->mode,
2149                                  as->nodes_written,
2150                                  atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
2151                                  bch2_journal_pin_seq(&c->journal, &as->journal));
2152         mutex_unlock(&c->btree_interior_update_lock);
2153
2154         return out - buf;
2155 }
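
     /*
      * Descriptive note: each line printed above is one in-flight interior
      * update - its address, mode, nodes_written count, the closure's
      * remaining refcount, and the journal sequence number pinned by its
      * journal pin.
      */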
2156
2157 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
2158 {
2159         size_t ret = 0;
2160         struct list_head *i;
2161
2162         mutex_lock(&c->btree_interior_update_lock);
2163         list_for_each(i, &c->btree_interior_update_list)
2164                 ret++;
2165         mutex_unlock(&c->btree_interior_update_lock);
2166
2167         return ret;
2168 }