[bcachefs-tools-debian] / libbcachefs / btree_update_interior.c
1
2 #include "bcachefs.h"
3 #include "alloc_foreground.h"
4 #include "bkey_methods.h"
5 #include "btree_cache.h"
6 #include "btree_gc.h"
7 #include "btree_update.h"
8 #include "btree_update_interior.h"
9 #include "btree_io.h"
10 #include "btree_iter.h"
11 #include "btree_locking.h"
12 #include "buckets.h"
13 #include "extents.h"
14 #include "journal.h"
15 #include "journal_reclaim.h"
16 #include "keylist.h"
17 #include "replicas.h"
18 #include "super-io.h"
19
20 #include <linux/random.h>
21 #include <trace/events/bcachefs.h>
22
23 static void btree_node_will_make_reachable(struct btree_update *,
24                                            struct btree *);
25 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
26 static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
27
28 /* Debug code: */
29
30 static void btree_node_interior_verify(struct btree *b)
31 {
32         struct btree_node_iter iter;
33         struct bkey_packed *k;
34
35         BUG_ON(!b->level);
36
37         bch2_btree_node_iter_init(&iter, b, &b->key.k.p);
38 #if 1
39         BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
40                bkey_cmp_left_packed(b, k, &b->key.k.p));
41
42         BUG_ON((bch2_btree_node_iter_advance(&iter, b),
43                 !bch2_btree_node_iter_end(&iter)));
44 #else
45         const char *msg;
46
47         msg = "not found";
48         k = bch2_btree_node_iter_peek(&iter, b);
49         if (!k)
50                 goto err;
51
52         msg = "isn't what it should be";
53         if (bkey_cmp_left_packed(b, k, &b->key.k.p))
54                 goto err;
55
56         bch2_btree_node_iter_advance(&iter, b);
57
58         msg = "isn't last key";
59         if (!bch2_btree_node_iter_end(&iter))
60                 goto err;
61         return;
62 err:
63         bch2_dump_btree_node(b);
64         printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
65                b->key.k.p.offset, msg);
66         BUG();
67 #endif
68 }
69
70 /* Calculate ideal packed bkey format for new btree nodes: */
71
72 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
73 {
74         struct bkey_packed *k;
75         struct bset_tree *t;
76         struct bkey uk;
77
78         bch2_bkey_format_add_pos(s, b->data->min_key);
79
80         for_each_bset(b, t)
81                 for (k = btree_bkey_first(b, t);
82                      k != btree_bkey_last(b, t);
83                      k = bkey_next(k))
84                         if (!bkey_whiteout(k)) {
85                                 uk = bkey_unpack_key(b, k);
86                                 bch2_bkey_format_add_key(s, &uk);
87                         }
88 }
89
90 static struct bkey_format bch2_btree_calc_format(struct btree *b)
91 {
92         struct bkey_format_state s;
93
94         bch2_bkey_format_init(&s);
95         __bch2_btree_calc_format(&s, b);
96
97         return bch2_bkey_format_done(&s);
98 }
99
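/*
 * How many u64s the node's live keys would take up if every key were
 * repacked with @new_f:
 */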
100 static size_t btree_node_u64s_with_format(struct btree *b,
101                                           struct bkey_format *new_f)
102 {
103         struct bkey_format *old_f = &b->format;
104
105         /* stupid integer promotion rules */
106         ssize_t delta =
107             (((int) new_f->key_u64s - old_f->key_u64s) *
108              (int) b->nr.packed_keys) +
109             (((int) new_f->key_u64s - BKEY_U64s) *
110              (int) b->nr.unpacked_keys);
111
112         BUG_ON(delta + b->nr.live_u64s < 0);
113
114         return b->nr.live_u64s + delta;
115 }
116
117 /**
118  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
119  *
120  * This assumes all keys can pack with the new format -- it just checks if
121  * the re-packed keys would fit inside the node itself.
122  */
123 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
124                                  struct bkey_format *new_f)
125 {
126         size_t u64s = btree_node_u64s_with_format(b, new_f);
127
128         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
129 }
130
131 /* Btree node freeing/allocation: */
132
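/*
 * Two btree node keys reference the same node on disk if any of their
 * pointers match exactly (same device, generation and offset):
 */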
133 static bool btree_key_matches(struct bch_fs *c,
134                               struct bkey_s_c_extent l,
135                               struct bkey_s_c_extent r)
136 {
137         const struct bch_extent_ptr *ptr1, *ptr2;
138
139         extent_for_each_ptr(l, ptr1)
140                 extent_for_each_ptr(r, ptr2)
141                         if (ptr1->dev == ptr2->dev &&
142                             ptr1->gen == ptr2->gen &&
143                             ptr1->offset == ptr2->offset)
144                                 return true;
145
146         return false;
147 }
148
149 /*
150  * We're doing the index update that makes @b unreachable, update stuff to
151  * reflect that:
152  *
153  * Must be called _before_ btree_update_updated_root() or
154  * btree_update_updated_node():
155  */
156 static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
157                                        struct bkey_s_c k,
158                                        struct bch_fs_usage *stats)
159 {
160         struct bch_fs *c = as->c;
161         struct pending_btree_node_free *d;
162         unsigned replicas;
163
164         /*
165          * btree_update lock is only needed here to avoid racing with
166          * gc:
167          */
168         mutex_lock(&c->btree_interior_update_lock);
169
170         for (d = as->pending; d < as->pending + as->nr_pending; d++)
171                 if (!bkey_cmp(k.k->p, d->key.k.p) &&
172                     btree_key_matches(c, bkey_s_c_to_extent(k),
173                                       bkey_i_to_s_c_extent(&d->key)))
174                         goto found;
175         BUG();
176 found:
177         BUG_ON(d->index_update_done);
178         d->index_update_done = true;
179
180         /*
181          * Btree nodes are accounted as freed in bch_alloc_stats when they're
182          * freed from the index:
183          */
184         replicas = bch2_extent_nr_dirty_ptrs(k);
185         if (replicas)
186                 stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
187                         c->opts.btree_node_size * replicas;
188
189         /*
190          * We're dropping @k from the btree, but it's still live until the
191          * index update is persistent so we need to keep a reference around for
192          * mark and sweep to find - that's primarily what the
193          * btree_node_pending_free list is for.
194          *
195          * So here (when we set index_update_done = true), we're moving an
196          * existing reference to a different part of the larger "gc keyspace" -
197          * and the new position comes after the old position, since GC marks
198          * the pending free list after it walks the btree.
199          *
200          * If we move the reference while mark and sweep is _between_ the old
201          * and the new position, mark and sweep will see the reference twice
202          * and it'll get double accounted - so check for that here and subtract
203          * to cancel out one of mark and sweep's markings if necessary:
204          */
205
206         /*
207          * bch2_mark_key() compares the current gc pos to the pos we're
208          * moving this reference from, hence one comparison here:
209          */
210         if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
211                 struct bch_fs_usage tmp = { 0 };
212
213                 bch2_mark_key(c, BKEY_TYPE_BTREE,
214                               bkey_i_to_s_c(&d->key),
215                               false, 0, b
216                               ? gc_pos_btree_node(b)
217                               : gc_pos_btree_root(as->btree_id),
218                               &tmp, 0, 0);
219                 /*
220                  * Don't apply tmp - pending deletes aren't tracked in
221                  * bch_alloc_stats:
222                  */
223         }
224
225         mutex_unlock(&c->btree_interior_update_lock);
226 }
227
228 static void __btree_node_free(struct bch_fs *c, struct btree *b)
229 {
230         trace_btree_node_free(c, b);
231
232         BUG_ON(btree_node_dirty(b));
233         BUG_ON(btree_node_need_write(b));
234         BUG_ON(b == btree_node_root(c, b));
235         BUG_ON(b->ob.nr);
236         BUG_ON(!list_empty(&b->write_blocked));
237         BUG_ON(b->will_make_reachable);
238
239         clear_btree_node_noevict(b);
240
241         bch2_btree_node_hash_remove(&c->btree_cache, b);
242
243         mutex_lock(&c->btree_cache.lock);
244         list_move(&b->list, &c->btree_cache.freeable);
245         mutex_unlock(&c->btree_cache.lock);
246 }
247
248 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
249 {
250         struct open_buckets ob = b->ob;
251
252         btree_update_drop_new_node(c, b);
253
254         b->ob.nr = 0;
255
256         clear_btree_node_dirty(b);
257
258         btree_node_lock_type(c, b, SIX_LOCK_write);
259         __btree_node_free(c, b);
260         six_unlock_write(&b->lock);
261
262         bch2_open_buckets_put(c, &ob);
263 }
264
265 void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
266                                 struct btree_iter *iter)
267 {
268         /*
269          * Is this a node that isn't reachable on disk yet?
270          *
271          * Nodes that aren't reachable yet have writes blocked until they're
272          * reachable - now that we've cancelled any pending writes and moved
273          * things waiting on that write to wait on this update, we can drop this
274          * node from the list of nodes that the other update is making
275          * reachable, prior to freeing it:
276          */
277         btree_update_drop_new_node(c, b);
278
279         __bch2_btree_node_lock_write(b, iter);
280         __btree_node_free(c, b);
281         six_unlock_write(&b->lock);
282
283         bch2_btree_iter_node_drop(iter, b);
284 }
285
286 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
287                                         struct pending_btree_node_free *pending)
288 {
289         struct bch_fs_usage stats = { 0 };
290
291         BUG_ON(!pending->index_update_done);
292
293         bch2_mark_key(c, BKEY_TYPE_BTREE,
294                       bkey_i_to_s_c(&pending->key),
295                       false, 0,
296                       gc_phase(GC_PHASE_PENDING_DELETE),
297                       &stats, 0, 0);
298         /*
299          * Don't apply stats - pending deletes aren't tracked in
300          * bch_alloc_stats:
301          */
302 }
303
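/*
 * Allocate the on-disk space for a new btree node - either by reusing an
 * entry from the btree reserve cache or by allocating fresh sectors from a
 * write point - and get an in-memory struct btree for it:
 */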
304 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
305                                              struct disk_reservation *res,
306                                              struct closure *cl,
307                                              unsigned flags)
308 {
309         struct write_point *wp;
310         struct btree *b;
311         BKEY_PADDED(k) tmp;
312         struct bkey_i_extent *e;
313         struct open_buckets ob = { .nr = 0 };
314         struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
315         unsigned nr_reserve;
316         enum alloc_reserve alloc_reserve;
317
318         if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
319                 nr_reserve      = 0;
320                 alloc_reserve   = RESERVE_ALLOC;
321         } else if (flags & BTREE_INSERT_USE_RESERVE) {
322                 nr_reserve      = BTREE_NODE_RESERVE / 2;
323                 alloc_reserve   = RESERVE_BTREE;
324         } else {
325                 nr_reserve      = BTREE_NODE_RESERVE;
326                 alloc_reserve   = RESERVE_NONE;
327         }
328
329         mutex_lock(&c->btree_reserve_cache_lock);
330         if (c->btree_reserve_cache_nr > nr_reserve) {
331                 struct btree_alloc *a =
332                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
333
334                 ob = a->ob;
335                 bkey_copy(&tmp.k, &a->k);
336                 mutex_unlock(&c->btree_reserve_cache_lock);
337                 goto mem_alloc;
338         }
339         mutex_unlock(&c->btree_reserve_cache_lock);
340
341 retry:
342         wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
343                                       writepoint_ptr(&c->btree_write_point),
344                                       &devs_have,
345                                       res->nr_replicas,
346                                       c->opts.metadata_replicas_required,
347                                       alloc_reserve, 0, cl);
348         if (IS_ERR(wp))
349                 return ERR_CAST(wp);
350
351         if (wp->sectors_free < c->opts.btree_node_size) {
352                 struct open_bucket *ob;
353                 unsigned i;
354
355                 open_bucket_for_each(c, &wp->ptrs, ob, i)
356                         if (ob->sectors_free < c->opts.btree_node_size)
357                                 ob->sectors_free = 0;
358
359                 bch2_alloc_sectors_done(c, wp);
360                 goto retry;
361         }
362
363         e = bkey_extent_init(&tmp.k);
364         bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size);
365
366         bch2_open_bucket_get(c, wp, &ob);
367         bch2_alloc_sectors_done(c, wp);
368 mem_alloc:
369         b = bch2_btree_node_mem_alloc(c);
370
371         /* we hold cannibalize_lock: */
372         BUG_ON(IS_ERR(b));
373         BUG_ON(b->ob.nr);
374
375         bkey_copy(&b->key, &tmp.k);
376         b->ob = ob;
377
378         return b;
379 }
380
381 static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
382 {
383         struct bch_fs *c = as->c;
384         struct btree *b;
385
386         BUG_ON(level >= BTREE_MAX_DEPTH);
387         BUG_ON(!as->reserve->nr);
388
389         b = as->reserve->b[--as->reserve->nr];
390
391         BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
392
393         set_btree_node_accessed(b);
394         set_btree_node_dirty(b);
395
396         bch2_bset_init_first(b, &b->data->keys);
397         memset(&b->nr, 0, sizeof(b->nr));
398         b->data->magic = cpu_to_le64(bset_magic(c));
399         b->data->flags = 0;
400         SET_BTREE_NODE_ID(b->data, as->btree_id);
401         SET_BTREE_NODE_LEVEL(b->data, level);
402         b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
403
404         bch2_btree_build_aux_trees(b);
405
406         btree_node_will_make_reachable(as, b);
407
408         trace_btree_node_alloc(c, b);
409         return b;
410 }
411
412 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
413                                                   struct btree *b,
414                                                   struct bkey_format format)
415 {
416         struct btree *n;
417
418         n = bch2_btree_node_alloc(as, b->level);
419
420         n->data->min_key        = b->data->min_key;
421         n->data->max_key        = b->data->max_key;
422         n->data->format         = format;
423         SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
424
425         btree_node_set_format(n, format);
426
427         bch2_btree_sort_into(as->c, n, b);
428
429         btree_node_reset_sib_u64s(n);
430
431         n->key.k.p = b->key.k.p;
432         return n;
433 }
434
435 static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
436                                                        struct btree *b)
437 {
438         struct bkey_format new_f = bch2_btree_calc_format(b);
439
440         /*
441          * The keys might expand with the new format - if they wouldn't fit in
442          * the btree node anymore, use the old format for now:
443          */
444         if (!bch2_btree_node_format_fits(as->c, b, &new_f))
445                 new_f = b->format;
446
447         return __bch2_btree_node_alloc_replacement(as, b, new_f);
448 }
449
450 static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
451 {
452         struct btree *b = bch2_btree_node_alloc(as, level);
453
454         b->data->min_key = POS_MIN;
455         b->data->max_key = POS_MAX;
456         b->data->format = bch2_btree_calc_format(b);
457         b->key.k.p = POS_MAX;
458
459         btree_node_set_format(b, b->data->format);
460         bch2_btree_build_aux_trees(b);
461
462         six_unlock_write(&b->lock);
463
464         return b;
465 }
466
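/*
 * Release a (possibly partially used) reserve: stash each remaining node's
 * open buckets and key in the btree reserve cache if there's room, otherwise
 * release them, then free the in-memory nodes:
 */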
467 static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
468 {
469         bch2_disk_reservation_put(c, &reserve->disk_res);
470
471         mutex_lock(&c->btree_reserve_cache_lock);
472
473         while (reserve->nr) {
474                 struct btree *b = reserve->b[--reserve->nr];
475
476                 six_unlock_write(&b->lock);
477
478                 if (c->btree_reserve_cache_nr <
479                     ARRAY_SIZE(c->btree_reserve_cache)) {
480                         struct btree_alloc *a =
481                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
482
483                         a->ob = b->ob;
484                         b->ob.nr = 0;
485                         bkey_copy(&a->k, &b->key);
486                 } else {
487                         bch2_open_buckets_put(c, &b->ob);
488                 }
489
490                 btree_node_lock_type(c, b, SIX_LOCK_write);
491                 __btree_node_free(c, b);
492                 six_unlock_write(&b->lock);
493
494                 six_unlock_intent(&b->lock);
495         }
496
497         mutex_unlock(&c->btree_reserve_cache_lock);
498
499         mempool_free(reserve, &c->btree_reserve_pool);
500 }
501
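/*
 * Pre-allocate @nr_nodes btree nodes, along with the disk reservation backing
 * them, for an interior node update:
 */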
502 static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
503                                                     unsigned nr_nodes,
504                                                     unsigned flags,
505                                                     struct closure *cl)
506 {
507         struct btree_reserve *reserve;
508         struct btree *b;
509         struct disk_reservation disk_res = { 0, 0 };
510         unsigned sectors = nr_nodes * c->opts.btree_node_size;
511         int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
512
513         if (flags & BTREE_INSERT_NOFAIL)
514                 disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
515
516         /*
517          * This check isn't necessary for correctness - it's just to potentially
518          * prevent us from doing a lot of work that'll end up being wasted:
519          */
520         ret = bch2_journal_error(&c->journal);
521         if (ret)
522                 return ERR_PTR(ret);
523
524         if (bch2_disk_reservation_get(c, &disk_res, sectors,
525                                       c->opts.metadata_replicas,
526                                       disk_res_flags))
527                 return ERR_PTR(-ENOSPC);
528
529         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
530
531         /*
532          * Protects reaping from the btree node cache and using the btree node
533          * open bucket reserve:
534          */
535         ret = bch2_btree_cache_cannibalize_lock(c, cl);
536         if (ret) {
537                 bch2_disk_reservation_put(c, &disk_res);
538                 return ERR_PTR(ret);
539         }
540
541         reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
542
543         reserve->disk_res = disk_res;
544         reserve->nr = 0;
545
546         while (reserve->nr < nr_nodes) {
547                 b = __bch2_btree_node_alloc(c, &disk_res,
548                                             flags & BTREE_INSERT_NOWAIT
549                                             ? NULL : cl, flags);
550                 if (IS_ERR(b)) {
551                         ret = PTR_ERR(b);
552                         goto err_free;
553                 }
554
555                 ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
556                                               bkey_i_to_s_c(&b->key));
557                 if (ret)
558                         goto err_free;
559
560                 reserve->b[reserve->nr++] = b;
561         }
562
563         bch2_btree_cache_cannibalize_unlock(c);
564         return reserve;
565 err_free:
566         bch2_btree_reserve_put(c, reserve);
567         bch2_btree_cache_cannibalize_unlock(c);
568         trace_btree_reserve_get_fail(c, nr_nodes, cl);
569         return ERR_PTR(ret);
570 }
571
572 /* Asynchronous interior node update machinery */
573
574 static void bch2_btree_update_free(struct btree_update *as)
575 {
576         struct bch_fs *c = as->c;
577
578         bch2_journal_pin_flush(&c->journal, &as->journal);
579
580         BUG_ON(as->nr_new_nodes);
581         BUG_ON(as->nr_pending);
582
583         if (as->reserve)
584                 bch2_btree_reserve_put(c, as->reserve);
585
586         mutex_lock(&c->btree_interior_update_lock);
587         list_del(&as->list);
588
589         closure_debug_destroy(&as->cl);
590         mempool_free(as, &c->btree_interior_update_pool);
591         percpu_ref_put(&c->writes);
592
593         closure_wake_up(&c->btree_interior_update_wait);
594         mutex_unlock(&c->btree_interior_update_lock);
595 }
596
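/*
 * The index update making our new nodes reachable is now persistent: write
 * out any new nodes that still need writing, free the nodes they replaced,
 * and tear down the btree_update:
 */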
597 static void btree_update_nodes_reachable(struct closure *cl)
598 {
599         struct btree_update *as = container_of(cl, struct btree_update, cl);
600         struct bch_fs *c = as->c;
601
602         bch2_journal_pin_drop(&c->journal, &as->journal);
603
604         mutex_lock(&c->btree_interior_update_lock);
605
606         while (as->nr_new_nodes) {
607                 struct btree *b = as->new_nodes[--as->nr_new_nodes];
608
609                 BUG_ON(b->will_make_reachable != (unsigned long) as);
610                 b->will_make_reachable = 0;
611                 mutex_unlock(&c->btree_interior_update_lock);
612
613                 /*
614                  * b->will_make_reachable prevented it from being written, so
615                  * write it now if it needs to be written:
616                  */
617                 btree_node_lock_type(c, b, SIX_LOCK_read);
618                 bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
619                 six_unlock_read(&b->lock);
620                 mutex_lock(&c->btree_interior_update_lock);
621         }
622
623         while (as->nr_pending)
624                 bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
625
626         mutex_unlock(&c->btree_interior_update_lock);
627
628         closure_wake_up(&as->wait);
629
630         bch2_btree_update_free(as);
631 }
632
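/*
 * Make sure the journal entry that will contain the pointer to the new root
 * is written before we free the old nodes:
 */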
633 static void btree_update_wait_on_journal(struct closure *cl)
634 {
635         struct btree_update *as = container_of(cl, struct btree_update, cl);
636         struct bch_fs *c = as->c;
637         int ret;
638
639         ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
640         if (ret < 0)
641                 goto err;
642         if (!ret) {
643                 continue_at(cl, btree_update_wait_on_journal, system_wq);
644                 return;
645         }
646
647         bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
648 err:
649         continue_at(cl, btree_update_nodes_reachable, system_wq);
650 }
651
652 static void btree_update_nodes_written(struct closure *cl)
653 {
654         struct btree_update *as = container_of(cl, struct btree_update, cl);
655         struct bch_fs *c = as->c;
656         struct btree *b;
657
658         /*
659          * We did an update to a parent node where the pointers we added pointed
660          * to child nodes that weren't written yet: now, the child nodes have
661          * been written so we can write out the update to the interior node.
662          */
663 retry:
664         mutex_lock(&c->btree_interior_update_lock);
665         as->nodes_written = true;
666
667         switch (as->mode) {
668         case BTREE_INTERIOR_NO_UPDATE:
669                 BUG();
670         case BTREE_INTERIOR_UPDATING_NODE:
671                 /* The usual case: */
672                 b = READ_ONCE(as->b);
673
674                 if (!six_trylock_read(&b->lock)) {
675                         mutex_unlock(&c->btree_interior_update_lock);
676                         btree_node_lock_type(c, b, SIX_LOCK_read);
677                         six_unlock_read(&b->lock);
678                         goto retry;
679                 }
680
681                 BUG_ON(!btree_node_dirty(b));
682                 closure_wait(&btree_current_write(b)->wait, cl);
683
684                 list_del(&as->write_blocked_list);
685                 mutex_unlock(&c->btree_interior_update_lock);
686
687                 /*
688                  * b->write_blocked prevented it from being written, so
689                  * write it now if it needs to be written:
690                  */
691                 bch2_btree_node_write_cond(c, b, true);
692                 six_unlock_read(&b->lock);
693                 break;
694
695         case BTREE_INTERIOR_UPDATING_AS:
696                 /*
697                  * The btree node we originally updated has been freed and is
698                  * being rewritten - so we don't need to write anything here, we just
699                  * need to signal to that btree_update that it's ok to make the
700                  * new replacement node visible:
701                  */
702                 closure_put(&as->parent_as->cl);
703
704                 /*
705                  * and then we have to wait on that btree_update to finish:
706                  */
707                 closure_wait(&as->parent_as->wait, cl);
708                 mutex_unlock(&c->btree_interior_update_lock);
709                 break;
710
711         case BTREE_INTERIOR_UPDATING_ROOT:
712                 /* b is the new btree root: */
713                 b = READ_ONCE(as->b);
714
715                 if (!six_trylock_read(&b->lock)) {
716                         mutex_unlock(&c->btree_interior_update_lock);
717                         btree_node_lock_type(c, b, SIX_LOCK_read);
718                         six_unlock_read(&b->lock);
719                         goto retry;
720                 }
721
722                 BUG_ON(c->btree_roots[b->btree_id].as != as);
723                 c->btree_roots[b->btree_id].as = NULL;
724
725                 bch2_btree_set_root_ondisk(c, b, WRITE);
726
727                 /*
728                  * We don't have to wait on anything here (before
729                  * btree_update_nodes_reachable frees the old nodes
730                  * ondisk) - we've ensured that the very next journal write will
731                  * have the pointer to the new root, and before the allocator
732                  * can reuse the old nodes it'll have to do a journal commit:
733                  */
734                 six_unlock_read(&b->lock);
735                 mutex_unlock(&c->btree_interior_update_lock);
736
737                 /*
738                  * Bit of funny circularity going on here we have to break:
739                  *
740                  * We have to drop our journal pin before writing the journal
741                  * entry that points to the new btree root: else, we could
742                  * deadlock if the journal currently happens to be full.
743                  *
744                  * This means we're dropping the journal pin _before_ the new
745                  * nodes are technically reachable - but this is safe, because
746                  * after the bch2_btree_set_root_ondisk() call above they will
747                  * be reachable as of the very next journal write:
748                  */
749                 bch2_journal_pin_drop(&c->journal, &as->journal);
750
751                 as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
752
753                 btree_update_wait_on_journal(cl);
754                 return;
755         }
756
757         continue_at(cl, btree_update_nodes_reachable, system_wq);
758 }
759
760 /*
761  * We're updating @b with pointers to nodes that haven't finished writing yet:
762  * block @b from being written until @as completes
763  */
764 static void btree_update_updated_node(struct btree_update *as, struct btree *b)
765 {
766         struct bch_fs *c = as->c;
767
768         mutex_lock(&c->btree_interior_update_lock);
769
770         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
771         BUG_ON(!btree_node_dirty(b));
772
773         as->mode = BTREE_INTERIOR_UPDATING_NODE;
774         as->b = b;
775         list_add(&as->write_blocked_list, &b->write_blocked);
776
777         mutex_unlock(&c->btree_interior_update_lock);
778
779         /*
780          * In general, when you're staging things in a journal that will later
781          * be written elsewhere, and you also want to guarantee ordering: that
782          * is, if you have updates a, b, c, after a crash you should never see c
783          * without a and b - there's a problem:
784          *
785          * If the final destination of the update(s) (i.e. btree node) can be
786          * written/flushed _before_ the relevant journal entry - oops, that
787          * breaks ordering, since the various leaf nodes can be written in any
788          * order.
789          *
790          * Normally we use bset->journal_seq to deal with this - if during
791          * recovery we find a btree node write that's newer than the newest
792          * journal entry, we just ignore it - we don't need it, anything we're
793          * supposed to have (that we reported as completed via fsync()) will
794          * still be in the journal, and as far as the state of the journal is
795          * concerned that btree node write never happened.
796          *
797          * That breaks when we're rewriting/splitting/merging nodes, since we're
798          * mixing btree node writes that haven't happened yet with previously
799          * written data that has been reported as completed to the journal.
800          *
801          * Thus, before making the new nodes reachable, we have to wait for the
802          * newest journal sequence number we have data for to be written (if it
803          * hasn't been yet).
804          */
805         bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
806 }
807
808 static void interior_update_flush(struct journal *j,
809                         struct journal_entry_pin *pin, u64 seq)
810 {
811         struct btree_update *as =
812                 container_of(pin, struct btree_update, journal);
813
814         bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
815 }
816
817 static void btree_update_reparent(struct btree_update *as,
818                                   struct btree_update *child)
819 {
820         struct bch_fs *c = as->c;
821
822         child->b = NULL;
823         child->mode = BTREE_INTERIOR_UPDATING_AS;
824         child->parent_as = as;
825         closure_get(&as->cl);
826
827         /*
828          * When we write a new btree root, we have to drop our journal pin
829          * _before_ the new nodes are technically reachable; see
830          * btree_update_nodes_written().
831          *
832          * This goes for journal pins that are recursively blocked on us - so,
833          * just transfer the journal pin to the new interior update so
834          * btree_update_nodes_written() can drop it.
835          */
836         bch2_journal_pin_add_if_older(&c->journal, &child->journal,
837                                       &as->journal, interior_update_flush);
838         bch2_journal_pin_drop(&c->journal, &child->journal);
839
840         as->journal_seq = max(as->journal_seq, child->journal_seq);
841 }
842
843 static void btree_update_updated_root(struct btree_update *as)
844 {
845         struct bch_fs *c = as->c;
846         struct btree_root *r = &c->btree_roots[as->btree_id];
847
848         mutex_lock(&c->btree_interior_update_lock);
849
850         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
851
852         /*
853          * Old root might not be persistent yet - if so, redirect its
854          * btree_update operation to point to us:
855          */
856         if (r->as)
857                 btree_update_reparent(as, r->as);
858
859         as->mode = BTREE_INTERIOR_UPDATING_ROOT;
860         as->b = r->b;
861         r->as = as;
862
863         mutex_unlock(&c->btree_interior_update_lock);
864
865         /*
866          * When we're rewriting nodes and updating interior nodes, there's an
867          * issue with updates that haven't been written in the journal getting
868          * mixed together with older data - see btree_update_updated_node()
869          * for the explanation.
870          *
871          * However, this doesn't affect us when we're writing a new btree root -
872          * because to make that new root reachable we have to write out a new
873          * journal entry, which must necessarily be newer than as->journal_seq.
874          */
875 }
876
877 static void btree_node_will_make_reachable(struct btree_update *as,
878                                            struct btree *b)
879 {
880         struct bch_fs *c = as->c;
881
882         mutex_lock(&c->btree_interior_update_lock);
883         BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
884         BUG_ON(b->will_make_reachable);
885
886         as->new_nodes[as->nr_new_nodes++] = b;
887         b->will_make_reachable = 1UL|(unsigned long) as;
888
889         closure_get(&as->cl);
890         mutex_unlock(&c->btree_interior_update_lock);
891 }
892
893 static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
894 {
895         struct btree_update *as;
896         unsigned long v;
897         unsigned i;
898
899         mutex_lock(&c->btree_interior_update_lock);
900         v = xchg(&b->will_make_reachable, 0);
901         as = (struct btree_update *) (v & ~1UL);
902
903         if (!as) {
904                 mutex_unlock(&c->btree_interior_update_lock);
905                 return;
906         }
907
908         for (i = 0; i < as->nr_new_nodes; i++)
909                 if (as->new_nodes[i] == b)
910                         goto found;
911
912         BUG();
913 found:
914         array_remove_item(as->new_nodes, as->nr_new_nodes, i);
915         mutex_unlock(&c->btree_interior_update_lock);
916
917         if (v & 1)
918                 closure_put(&as->cl);
919 }
920
921 static void btree_interior_update_add_node_reference(struct btree_update *as,
922                                                      struct btree *b)
923 {
924         struct bch_fs *c = as->c;
925         struct pending_btree_node_free *d;
926
927         mutex_lock(&c->btree_interior_update_lock);
928
929         /* Add this node to the list of nodes being freed: */
930         BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
931
932         d = &as->pending[as->nr_pending++];
933         d->index_update_done    = false;
934         d->seq                  = b->data->keys.seq;
935         d->btree_id             = b->btree_id;
936         d->level                = b->level;
937         bkey_copy(&d->key, &b->key);
938
939         mutex_unlock(&c->btree_interior_update_lock);
940 }
941
942 /*
943  * @b is being split/rewritten: it may have pointers to not-yet-written btree
944  * nodes and thus outstanding btree_updates - redirect @b's
945  * btree_updates to point to this btree_update:
946  */
947 void bch2_btree_interior_update_will_free_node(struct btree_update *as,
948                                                struct btree *b)
949 {
950         struct bch_fs *c = as->c;
951         struct closure *cl, *cl_n;
952         struct btree_update *p, *n;
953         struct btree_write *w;
954         struct bset_tree *t;
955
956         set_btree_node_dying(b);
957
958         if (btree_node_fake(b))
959                 return;
960
961         btree_interior_update_add_node_reference(as, b);
962
963         /*
964          * Does this node have data that hasn't been written in the journal?
965          *
966          * If so, we have to wait for the corresponding journal entry to be
967          * written before making the new nodes reachable - we can't just carry
968          * over the bset->journal_seq tracking, since we'll be mixing those keys
969          * in with keys that aren't in the journal anymore:
970          */
971         for_each_bset(b, t)
972                 as->journal_seq = max(as->journal_seq,
973                                       le64_to_cpu(bset(b, t)->journal_seq));
974
975         mutex_lock(&c->btree_interior_update_lock);
976
977         /*
978          * Does this node have any btree_update operations preventing
979          * it from being written?
980          *
981          * If so, redirect them to point to this btree_update: we can
982          * write out our new nodes, but we won't make them visible until those
983          * operations complete
984          */
985         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
986                 list_del(&p->write_blocked_list);
987                 btree_update_reparent(as, p);
988         }
989
990         clear_btree_node_dirty(b);
991         clear_btree_node_need_write(b);
992         w = btree_current_write(b);
993
994         /*
995          * Does this node have any btree_update operations waiting on this node
996          * to be written?
997          *
998          * If so, wake them up when this btree_update operation is reachable:
999          */
1000         llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
1001                 llist_add(&cl->list, &as->wait.list);
1002
1003         /*
1004          * Does this node have unwritten data that has a pin on the journal?
1005          *
1006          * If so, transfer that pin to the btree_update operation -
1007          * note that if we're freeing multiple nodes, we only need to keep the
1008          * oldest pin of any of the nodes we're freeing. We'll release the pin
1009          * when the new nodes are persistent and reachable on disk:
1010          */
1011         bch2_journal_pin_add_if_older(&c->journal, &w->journal,
1012                                       &as->journal, interior_update_flush);
1013         bch2_journal_pin_drop(&c->journal, &w->journal);
1014
1015         w = btree_prev_write(b);
1016         bch2_journal_pin_add_if_older(&c->journal, &w->journal,
1017                                       &as->journal, interior_update_flush);
1018         bch2_journal_pin_drop(&c->journal, &w->journal);
1019
1020         mutex_unlock(&c->btree_interior_update_lock);
1021 }
1022
1023 void bch2_btree_update_done(struct btree_update *as)
1024 {
1025         BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
1026
1027         bch2_btree_reserve_put(as->c, as->reserve);
1028         as->reserve = NULL;
1029
1030         continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq);
1031 }
1032
1033 struct btree_update *
1034 bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
1035                         unsigned nr_nodes, unsigned flags,
1036                         struct closure *cl)
1037 {
1038         struct btree_reserve *reserve;
1039         struct btree_update *as;
1040
1041         if (unlikely(!percpu_ref_tryget(&c->writes)))
1042                 return ERR_PTR(-EROFS);
1043
1044         reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
1045         if (IS_ERR(reserve)) {
1046                 percpu_ref_put(&c->writes);
1047                 return ERR_CAST(reserve);
1048         }
1049
1050         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
1051         memset(as, 0, sizeof(*as));
1052         closure_init(&as->cl, NULL);
1053         as->c           = c;
1054         as->mode        = BTREE_INTERIOR_NO_UPDATE;
1055         as->btree_id    = id;
1056         as->reserve     = reserve;
1057         INIT_LIST_HEAD(&as->write_blocked_list);
1058
1059         bch2_keylist_init(&as->parent_keys, as->inline_keys);
1060
1061         mutex_lock(&c->btree_interior_update_lock);
1062         list_add_tail(&as->list, &c->btree_interior_update_list);
1063         mutex_unlock(&c->btree_interior_update_lock);
1064
1065         return as;
1066 }
1067
1068 /* Btree root updates: */
1069
1070 static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1071 {
1072         /* Root nodes cannot be reaped */
1073         mutex_lock(&c->btree_cache.lock);
1074         list_del_init(&b->list);
1075         mutex_unlock(&c->btree_cache.lock);
1076
1077         mutex_lock(&c->btree_root_lock);
1078         BUG_ON(btree_node_root(c, b) &&
1079                (b->level < btree_node_root(c, b)->level ||
1080                 !btree_node_dying(btree_node_root(c, b))));
1081
1082         btree_node_root(c, b) = b;
1083         mutex_unlock(&c->btree_root_lock);
1084
1085         bch2_recalc_btree_reserve(c);
1086 }
1087
1088 static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
1089 {
1090         struct bch_fs *c = as->c;
1091         struct btree *old = btree_node_root(c, b);
1092         struct bch_fs_usage stats = { 0 };
1093
1094         __bch2_btree_set_root_inmem(c, b);
1095
1096         bch2_mark_key(c, BKEY_TYPE_BTREE,
1097                       bkey_i_to_s_c(&b->key),
1098                       true, 0,
1099                       gc_pos_btree_root(b->btree_id),
1100                       &stats, 0, 0);
1101
1102         if (old && !btree_node_fake(old))
1103                 bch2_btree_node_free_index(as, NULL,
1104                                            bkey_i_to_s_c(&old->key),
1105                                            &stats);
1106         bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1107                             gc_pos_btree_root(b->btree_id));
1108 }
1109
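/*
 * Update the in-memory copy of the root pointer that gets written out with
 * every journal write:
 */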
1110 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
1111 {
1112         struct btree_root *r = &c->btree_roots[b->btree_id];
1113
1114         mutex_lock(&c->btree_root_lock);
1115
1116         BUG_ON(b != r->b);
1117         bkey_copy(&r->key, &b->key);
1118         r->level = b->level;
1119         r->alive = true;
1120         if (rw == WRITE)
1121                 c->btree_roots_dirty = true;
1122
1123         mutex_unlock(&c->btree_root_lock);
1124 }
1125
1126 /**
1127  * bch2_btree_set_root - update the root in memory and on disk
1128  *
1129  * To ensure forward progress, the current task must not be holding any
1130  * btree node write locks. However, you must hold an intent lock on the
1131  * old root.
1132  *
1133  * Note: This allocates a journal entry but doesn't add any keys to
1134  * it.  All the btree roots are part of every journal write, so there
1135  * is nothing new to be done.  This just guarantees that there is a
1136  * journal write.
1137  */
1138 static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
1139                                 struct btree_iter *iter)
1140 {
1141         struct bch_fs *c = as->c;
1142         struct btree *old;
1143
1144         trace_btree_set_root(c, b);
1145         BUG_ON(!b->written &&
1146                !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
1147
1148         old = btree_node_root(c, b);
1149
1150         /*
1151          * Ensure no one is using the old root while we switch to the
1152          * new root:
1153          */
1154         bch2_btree_node_lock_write(old, iter);
1155
1156         bch2_btree_set_root_inmem(as, b);
1157
1158         btree_update_updated_root(as);
1159
1160         /*
1161          * Unlock old root after new root is visible:
1162          *
1163          * The new root isn't persistent, but that's ok: we still have
1164          * an intent lock on the new root, and any updates that would
1165          * depend on the new root would have to update the new root.
1166          */
1167         bch2_btree_node_unlock_write(old, iter);
1168 }
1169
1170 /* Interior node updates: */
1171
1172 static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
1173                                         struct btree_iter *iter,
1174                                         struct bkey_i *insert,
1175                                         struct btree_node_iter *node_iter)
1176 {
1177         struct bch_fs *c = as->c;
1178         struct bch_fs_usage stats = { 0 };
1179         struct bkey_packed *k;
1180         struct bkey tmp;
1181
1182         BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b));
1183
1184         if (bkey_extent_is_data(&insert->k))
1185                 bch2_mark_key(c, BKEY_TYPE_BTREE,
1186                               bkey_i_to_s_c(insert),
1187                               true, 0,
1188                               gc_pos_btree_node(b), &stats, 0, 0);
1189
1190         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1191                bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
1192                 bch2_btree_node_iter_advance(node_iter, b);
1193
1194         /*
1195          * If we're overwriting, look up pending delete and mark so that gc
1196          * marks it on the pending delete list:
1197          */
1198         if (k && !bkey_cmp_packed(b, k, &insert->k))
1199                 bch2_btree_node_free_index(as, b,
1200                                            bkey_disassemble(b, k, &tmp),
1201                                            &stats);
1202
1203         bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1204                             gc_pos_btree_node(b));
1205
1206         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
1207         set_btree_node_dirty(b);
1208         set_btree_node_need_write(b);
1209 }
1210
1211 /*
1212  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1213  * node)
1214  */
1215 static struct btree *__btree_split_node(struct btree_update *as,
1216                                         struct btree *n1,
1217                                         struct btree_iter *iter)
1218 {
1219         size_t nr_packed = 0, nr_unpacked = 0;
1220         struct btree *n2;
1221         struct bset *set1, *set2;
1222         struct bkey_packed *k, *prev = NULL;
1223
1224         n2 = bch2_btree_node_alloc(as, n1->level);
1225
1226         n2->data->max_key       = n1->data->max_key;
1227         n2->data->format        = n1->format;
1228         SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
1229         n2->key.k.p = n1->key.k.p;
1230
1231         btree_node_set_format(n2, n2->data->format);
1232
1233         set1 = btree_bset_first(n1);
1234         set2 = btree_bset_first(n2);
1235
1236         /*
1237          * Has to be a linear search because we don't have an auxiliary
1238          * search tree yet
1239          */
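        /*
         * Keep roughly the first 3/5 of the keys (by u64s) in n1; the last
         * key kept becomes the pivot:
         */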
1240         k = set1->start;
1241         while (1) {
1242                 if (bkey_next(k) == vstruct_last(set1))
1243                         break;
1244                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1245                         break;
1246
1247                 if (bkey_packed(k))
1248                         nr_packed++;
1249                 else
1250                         nr_unpacked++;
1251
1252                 prev = k;
1253                 k = bkey_next(k);
1254         }
1255
1256         BUG_ON(!prev);
1257
1258         n1->key.k.p = bkey_unpack_pos(n1, prev);
1259         n1->data->max_key = n1->key.k.p;
1260         n2->data->min_key =
1261                 btree_type_successor(n1->btree_id, n1->key.k.p);
1262
1263         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1264         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1265
1266         set_btree_bset_end(n1, n1->set);
1267         set_btree_bset_end(n2, n2->set);
1268
1269         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1270         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1271         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1272         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1273
1274         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1275         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1276         n1->nr.packed_keys      = nr_packed;
1277         n1->nr.unpacked_keys    = nr_unpacked;
1278
1279         BUG_ON(!set1->u64s);
1280         BUG_ON(!set2->u64s);
1281
1282         memcpy_u64s(set2->start,
1283                     vstruct_end(set1),
1284                     le16_to_cpu(set2->u64s));
1285
1286         btree_node_reset_sib_u64s(n1);
1287         btree_node_reset_sib_u64s(n2);
1288
1289         bch2_verify_btree_nr_keys(n1);
1290         bch2_verify_btree_nr_keys(n2);
1291
1292         if (n1->level) {
1293                 btree_node_interior_verify(n1);
1294                 btree_node_interior_verify(n2);
1295         }
1296
1297         return n2;
1298 }
1299
1300 /*
1301  * For updates to interior nodes, we've got to do the insert before we split
1302  * because the stuff we're inserting has to be inserted atomically. Post split,
1303  * the keys might have to go in different nodes and the split would no longer be
1304  * atomic.
1305  *
1306  * Worse, if the insert is from btree node coalescing and we do the insert after
1307  * we do the split (and pick the pivot), the pivot we pick might be between
1308  * nodes that were coalesced, and thus in the middle of a child node post
1309  * coalescing:
1310  */
1311 static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
1312                                     struct btree_iter *iter,
1313                                     struct keylist *keys)
1314 {
1315         struct btree_node_iter node_iter;
1316         struct bkey_i *k = bch2_keylist_front(keys);
1317         struct bkey_packed *p;
1318         struct bset *i;
1319
1320         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1321
1322         bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1323
1324         while (!bch2_keylist_empty(keys)) {
1325                 k = bch2_keylist_front(keys);
1326
1327                 BUG_ON(bch_keylist_u64s(keys) >
1328                        bch_btree_keys_u64s_remaining(as->c, b));
1329                 BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
1330                 BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
1331
1332                 bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
1333                 bch2_keylist_pop_front(keys);
1334         }
1335
1336         /*
1337          * We can't tolerate whiteouts here - with whiteouts there can be
1338          * duplicate keys, and it would be rather bad if we picked a duplicate
1339          * for the pivot:
1340          */
1341         i = btree_bset_first(b);
1342         p = i->start;
1343         while (p != vstruct_last(i))
1344                 if (bkey_deleted(p)) {
1345                         le16_add_cpu(&i->u64s, -p->u64s);
1346                         set_btree_bset_end(b, b->set);
1347                         memmove_u64s_down(p, bkey_next(p),
1348                                           (u64 *) vstruct_last(i) -
1349                                           (u64 *) p);
1350                 } else
1351                         p = bkey_next(p);
1352
1353         BUG_ON(b->nsets != 1 ||
1354                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1355
1356         btree_node_interior_verify(b);
1357 }
1358
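/*
 * Split @b (or just compact it, if everything still fits in one node): insert
 * @keys into the replacement node(s), write them out, then update the parent -
 * or the root pointer, allocating a new root if the depth increases:
 */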
1359 static void btree_split(struct btree_update *as, struct btree *b,
1360                         struct btree_iter *iter, struct keylist *keys,
1361                         unsigned flags)
1362 {
1363         struct bch_fs *c = as->c;
1364         struct btree *parent = btree_node_parent(iter, b);
1365         struct btree *n1, *n2 = NULL, *n3 = NULL;
1366         u64 start_time = local_clock();
1367
1368         BUG_ON(!parent && (b != btree_node_root(c, b)));
1369         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1370
1371         bch2_btree_interior_update_will_free_node(as, b);
1372
1373         n1 = bch2_btree_node_alloc_replacement(as, b);
1374
1375         if (keys)
1376                 btree_split_insert_keys(as, n1, iter, keys);
1377
1378         if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
1379                 trace_btree_split(c, b);
1380
1381                 n2 = __btree_split_node(as, n1, iter);
1382
1383                 bch2_btree_build_aux_trees(n2);
1384                 bch2_btree_build_aux_trees(n1);
1385                 six_unlock_write(&n2->lock);
1386                 six_unlock_write(&n1->lock);
1387
1388                 bch2_btree_node_write(c, n2, SIX_LOCK_intent);
1389
1390                 /*
1391                  * Note that on recursive splits, parent_keys == keys, so we
1392                  * can't start adding new keys to parent_keys before emptying it
1393                  * out (which we did with btree_split_insert_keys() above)
1394                  */
1395                 bch2_keylist_add(&as->parent_keys, &n1->key);
1396                 bch2_keylist_add(&as->parent_keys, &n2->key);
1397
1398                 if (!parent) {
1399                         /* Depth increases, make a new root */
1400                         n3 = __btree_root_alloc(as, b->level + 1);
1401
1402                         n3->sib_u64s[0] = U16_MAX;
1403                         n3->sib_u64s[1] = U16_MAX;
1404
1405                         btree_split_insert_keys(as, n3, iter, &as->parent_keys);
1406
1407                         bch2_btree_node_write(c, n3, SIX_LOCK_intent);
1408                 }
1409         } else {
1410                 trace_btree_compact(c, b);
1411
1412                 bch2_btree_build_aux_trees(n1);
1413                 six_unlock_write(&n1->lock);
1414
1415                 bch2_keylist_add(&as->parent_keys, &n1->key);
1416         }
1417
1418         bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1419
1420         /* New nodes all written, now make them visible: */
1421
1422         if (parent) {
1423                 /* Split a non root node */
1424                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1425         } else if (n3) {
1426                 bch2_btree_set_root(as, n3, iter);
1427         } else {
1428                 /* Root filled up but didn't need to be split */
1429                 bch2_btree_set_root(as, n1, iter);
1430         }
1431
1432         bch2_open_buckets_put(c, &n1->ob);
1433         if (n2)
1434                 bch2_open_buckets_put(c, &n2->ob);
1435         if (n3)
1436                 bch2_open_buckets_put(c, &n3->ob);
1437
1438         /*
1439          * Note - at this point other linked iterators could still have @b read
1440          * locked; we're depending on the bch2_btree_iter_node_replace() calls
1441          * below removing all references to @b so we don't return with other
1442          * iterators pointing to a node they have locked that's been freed.
1443          *
1444          * We have to free the node first because the bch2_btree_iter_node_replace()
1445          * calls will drop _our_ iterator's reference - and intent lock - to @b.
1446          */
1447         bch2_btree_node_free_inmem(c, b, iter);
1448
1449         /* Successful split, update the iterator to point to the new nodes: */
1450
1451         if (n3)
1452                 bch2_btree_iter_node_replace(iter, n3);
1453         if (n2)
1454                 bch2_btree_iter_node_replace(iter, n2);
1455         bch2_btree_iter_node_replace(iter, n1);
1456
1457         bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
1458 }
1459
1460 static void
1461 bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
1462                                 struct btree_iter *iter, struct keylist *keys)
1463 {
1464         struct btree_iter *linked;
1465         struct btree_node_iter node_iter;
1466         struct bkey_i *insert = bch2_keylist_front(keys);
1467         struct bkey_packed *k;
1468
1469         /* Don't screw up @iter's position: */
1470         node_iter = iter->l[b->level].iter;
1471
1472         /*
1473          * btree_split(), btree_gc_coalesce() will insert keys before
1474          * the iterator's current position - they know the keys go in
1475          * the node the iterator points to:
1476          */
1477         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1478                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1479                 ;
1480
1481         while (!bch2_keylist_empty(keys)) {
1482                 insert = bch2_keylist_front(keys);
1483
1484                 bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
1485                 bch2_keylist_pop_front(keys);
1486         }
1487
1488         btree_update_updated_node(as, b);
1489
1490         for_each_btree_iter_with_node(iter, b, linked)
1491                 bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
1492
1493         bch2_btree_iter_verify(iter, b);
1494 }
1495
1496 /**
1497  * bch2_btree_insert_node - insert bkeys into a given btree node
1498  *
1499  * @as:                 btree_update this node belongs to
1500  * @b:                  interior node to insert into
1501  * @iter:               btree iterator
1502  * @keys:               list of keys to insert
1503  * @flags:              BTREE_INSERT_* flags
1504  *
1505  * Inserts as many keys as it can into @b, splitting it if the keys don't
1506  * fit; on a split, the keys are inserted atomically as part of the split.
1507  */
1508 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
1509                             struct btree_iter *iter, struct keylist *keys,
1510                             unsigned flags)
1511 {
1512         struct bch_fs *c = as->c;
1513         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1514         int old_live_u64s = b->nr.live_u64s;
1515         int live_u64s_added, u64s_added;
1516
1517         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1518         BUG_ON(!b->level);
1519         BUG_ON(!as || as->b);
1520         bch2_verify_keylist_sorted(keys);
1521
1522         if (as->must_rewrite)
1523                 goto split;
1524
1525         bch2_btree_node_lock_for_insert(c, b, iter);
1526
1527         if (!bch2_btree_node_insert_fits(c, b, bch_keylist_u64s(keys))) {
1528                 bch2_btree_node_unlock_write(b, iter);
1529                 goto split;
1530         }
1531
1532         bch2_btree_insert_keys_interior(as, b, iter, keys);
1533
1534         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1535         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1536
1537         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1538                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1539         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1540                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1541
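        /*
         * If the bset grew by more u64s than the number of live keys added,
         * part of what was just inserted is dead space (whiteouts or
         * overwritten keys); see if compacting whiteouts gets room back:
         */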
1542         if (u64s_added > live_u64s_added &&
1543             bch2_maybe_compact_whiteouts(c, b))
1544                 bch2_btree_iter_reinit_node(iter, b);
1545
1546         bch2_btree_node_unlock_write(b, iter);
1547
1548         btree_node_interior_verify(b);
1549
1550         /*
1551          * when called from the btree_split path the new nodes aren't added to
1552          * the btree iterator yet, so the merge path's unlock/wait/relock dance
1553          * won't work:
1554          */
1555         bch2_foreground_maybe_merge(c, iter, b->level,
1556                                     flags|BTREE_INSERT_NOUNLOCK);
1557         return;
1558 split:
1559         btree_split(as, b, iter, keys, flags);
1560 }
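
/*
 * A minimal caller sketch (illustrative, not part of the original source):
 * interior node updates stage the new pointer keys on as->parent_keys and
 * then hand the keylist to bch2_btree_insert_node(), as the split, merge
 * and rewrite paths in this file do:
 *
 *      bch2_keylist_add(&as->parent_keys, &n->key);
 *      bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 *
 * where @n is a newly allocated node and @parent the node it belongs under.
 */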
1561
1562 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
1563                           unsigned flags)
1564 {
1565         struct btree *b = iter->l[0].b;
1566         struct btree_update *as;
1567         struct closure cl;
1568         int ret = 0;
1569         struct btree_iter *linked;
1570
1571         /*
1572          * We already have a disk reservation and open buckets pinned; this
1573          * allocation must not block:
1574          */
1575         for_each_btree_iter(iter, linked)
1576                 if (linked->btree_id == BTREE_ID_EXTENTS)
1577                         flags |= BTREE_INSERT_USE_RESERVE;
1578
1579         closure_init_stack(&cl);
1580
1581         /* Hack, because gc and splitting nodes don't mix yet: */
1582         if (!down_read_trylock(&c->gc_lock)) {
1583                 if (flags & BTREE_INSERT_NOUNLOCK)
1584                         return -EINTR;
1585
1586                 bch2_btree_iter_unlock(iter);
1587                 down_read(&c->gc_lock);
1588
1589                 if (btree_iter_linked(iter))
1590                         ret = -EINTR;
1591         }
1592
1593         /*
1594          * XXX: figure out how far we might need to split,
1595          * instead of locking/reserving all the way to the root:
1596          */
1597         if (!bch2_btree_iter_upgrade(iter, U8_MAX,
1598                         !(flags & BTREE_INSERT_NOUNLOCK))) {
1599                 ret = -EINTR;
1600                 goto out;
1601         }
1602
1603         as = bch2_btree_update_start(c, iter->btree_id,
1604                 btree_update_reserve_required(c, b), flags,
1605                 !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1606         if (IS_ERR(as)) {
1607                 ret = PTR_ERR(as);
1608                 if (ret == -EAGAIN) {
1609                         BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
1610                         bch2_btree_iter_unlock(iter);
1611                         ret = -EINTR;
1612                 }
1613                 goto out;
1614         }
1615
1616         btree_split(as, b, iter, NULL, flags);
1617         bch2_btree_update_done(as);
1618
1619         /*
1620          * We haven't successfully inserted yet, so don't downgrade all the way
1621          * back to read locks:
1622          */
1623         __bch2_btree_iter_downgrade(iter, 1);
1624 out:
1625         up_read(&c->gc_lock);
1626         closure_sync(&cl);
1627         return ret;
1628 }
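
/*
 * Sketch of how a caller might handle the errors above (an assumption about
 * the calling convention, not taken from this file): -EINTR means the split
 * couldn't be done without giving up locks, so re-traverse the iterator and
 * retry, as bch2_btree_node_rewrite() below does for its own retries:
 *
 *      do {
 *              ret = bch2_btree_split_leaf(c, iter, flags);
 *              if (ret == -EINTR)
 *                      ret = bch2_btree_iter_traverse(iter) ?: -EINTR;
 *      } while (ret == -EINTR);
 */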
1629
1630 void __bch2_foreground_maybe_merge(struct bch_fs *c,
1631                                    struct btree_iter *iter,
1632                                    unsigned level,
1633                                    unsigned flags,
1634                                    enum btree_node_sibling sib)
1635 {
1636         struct btree_update *as;
1637         struct bkey_format_state new_s;
1638         struct bkey_format new_f;
1639         struct bkey_i delete;
1640         struct btree *b, *m, *n, *prev, *next, *parent;
1641         struct closure cl;
1642         size_t sib_u64s;
1643         int ret = 0;
1644
1645         closure_init_stack(&cl);
1646 retry:
1647         BUG_ON(!btree_node_locked(iter, level));
1648
1649         b = iter->l[level].b;
1650
1651         parent = btree_node_parent(iter, b);
1652         if (!parent)
1653                 goto out;
1654
1655         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1656                 goto out;
1657
1658         /* XXX: can't be holding read locks */
1659         m = bch2_btree_node_get_sibling(c, iter, b,
1660                         !(flags & BTREE_INSERT_NOUNLOCK), sib);
1661         if (IS_ERR(m)) {
1662                 ret = PTR_ERR(m);
1663                 goto err;
1664         }
1665
1666         /* NULL means no sibling: */
1667         if (!m) {
1668                 b->sib_u64s[sib] = U16_MAX;
1669                 goto out;
1670         }
1671
1672         if (sib == btree_prev_sib) {
1673                 prev = m;
1674                 next = b;
1675         } else {
1676                 prev = b;
1677                 next = m;
1678         }
1679
1680         bch2_bkey_format_init(&new_s);
1681         __bch2_btree_calc_format(&new_s, b);
1682         __bch2_btree_calc_format(&new_s, m);
1683         new_f = bch2_bkey_format_done(&new_s);
1684
1685         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1686                 btree_node_u64s_with_format(m, &new_f);
1687
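        /*
         * Only count half of the amount by which the combined size exceeds
         * the hysteresis threshold: e.g. with threshold H and a combined
         * size of H + 200 u64s, the estimate cached below is H + 100.
         */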
1688         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1689                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1690                 sib_u64s /= 2;
1691                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1692         }
1693
1694         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1695         b->sib_u64s[sib] = sib_u64s;
1696
1697         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1698                 six_unlock_intent(&m->lock);
1699                 goto out;
1700         }
1701
1702         /* We're changing btree topology, which doesn't mix with gc: */
1703         if (!down_read_trylock(&c->gc_lock))
1704                 goto err_cycle_gc_lock;
1705
1706         if (!bch2_btree_iter_upgrade(iter, U8_MAX,
1707                         !(flags & BTREE_INSERT_NOUNLOCK))) {
1708                 ret = -EINTR;
1709                 goto err_unlock;
1710         }
1711
1712         as = bch2_btree_update_start(c, iter->btree_id,
1713                          btree_update_reserve_required(c, parent) + 1,
1714                          BTREE_INSERT_NOFAIL|
1715                          BTREE_INSERT_USE_RESERVE,
1716                          !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
1717         if (IS_ERR(as)) {
1718                 ret = PTR_ERR(as);
1719                 goto err_unlock;
1720         }
1721
1722         trace_btree_merge(c, b);
1723
1724         bch2_btree_interior_update_will_free_node(as, b);
1725         bch2_btree_interior_update_will_free_node(as, m);
1726
1727         n = bch2_btree_node_alloc(as, b->level);
1728
1729         n->data->min_key        = prev->data->min_key;
1730         n->data->max_key        = next->data->max_key;
1731         n->data->format         = new_f;
1732         n->key.k.p              = next->key.k.p;
1733
1734         btree_node_set_format(n, new_f);
1735
1736         bch2_btree_sort_into(c, n, prev);
1737         bch2_btree_sort_into(c, n, next);
1738
1739         bch2_btree_build_aux_trees(n);
1740         six_unlock_write(&n->lock);
1741
1742         bkey_init(&delete.k);
1743         delete.k.p = prev->key.k.p;
1744         bch2_keylist_add(&as->parent_keys, &delete);
1745         bch2_keylist_add(&as->parent_keys, &n->key);
1746
1747         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1748
1749         bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1750
1751         bch2_open_buckets_put(c, &n->ob);
1752         bch2_btree_node_free_inmem(c, b, iter);
1753         bch2_btree_node_free_inmem(c, m, iter);
1754         bch2_btree_iter_node_replace(iter, n);
1755
1756         bch2_btree_iter_verify(iter, n);
1757
1758         bch2_btree_update_done(as);
1759
1760         six_unlock_intent(&m->lock);
1761         up_read(&c->gc_lock);
1762 out:
1763         /*
1764          * Don't downgrade locks here: we're called after a successful insert,
1765          * and the caller will downgrade locks itself anyway (in case e.g. a
1766          * split was required first).
1767          *
1768          * We're also called when inserting into interior nodes in the split
1769          * path, and downgrading to read locks there would be potentially
1770          * confusing.
1771          */
1772         closure_sync(&cl);
1773         return;
1774
1775 err_cycle_gc_lock:
1776         six_unlock_intent(&m->lock);
1777
1778         if (flags & BTREE_INSERT_NOUNLOCK)
1779                 goto out;
1780
1781         bch2_btree_iter_unlock(iter);
1782
1783         down_read(&c->gc_lock);
1784         up_read(&c->gc_lock);
1785         ret = -EINTR;
1786         goto err;
1787
1788 err_unlock:
1789         six_unlock_intent(&m->lock);
1790         up_read(&c->gc_lock);
1791 err:
1792         BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
1793
1794         if ((ret == -EAGAIN || ret == -EINTR) &&
1795             !(flags & BTREE_INSERT_NOUNLOCK)) {
1796                 bch2_btree_iter_unlock(iter);
1797                 closure_sync(&cl);
1798                 ret = bch2_btree_iter_traverse(iter);
1799                 if (ret)
1800                         goto out;
1801
1802                 goto retry;
1803         }
1804
1805         goto out;
1806 }
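
/*
 * The wrapper used earlier in this file, bch2_foreground_maybe_merge(),
 * presumably just tries merging in both directions; a minimal sketch,
 * assuming an enum counterpart btree_next_sib to the btree_prev_sib value
 * used above:
 *
 *      __bch2_foreground_maybe_merge(c, iter, level, flags, btree_prev_sib);
 *      __bch2_foreground_maybe_merge(c, iter, level, flags, btree_next_sib);
 */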
1807
1808 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1809                                 struct btree *b, unsigned flags,
1810                                 struct closure *cl)
1811 {
1812         struct btree *n, *parent = btree_node_parent(iter, b);
1813         struct btree_update *as;
1814
1815         as = bch2_btree_update_start(c, iter->btree_id,
1816                 (parent
1817                  ? btree_update_reserve_required(c, parent)
1818                  : 0) + 1,
1819                 flags, cl);
1820         if (IS_ERR(as)) {
1821                 trace_btree_gc_rewrite_node_fail(c, b);
1822                 return PTR_ERR(as);
1823         }
1824
1825         bch2_btree_interior_update_will_free_node(as, b);
1826
1827         n = bch2_btree_node_alloc_replacement(as, b);
1828
1829         bch2_btree_build_aux_trees(n);
1830         six_unlock_write(&n->lock);
1831
1832         trace_btree_gc_rewrite_node(c, b);
1833
1834         bch2_btree_node_write(c, n, SIX_LOCK_intent);
1835
1836         if (parent) {
1837                 bch2_keylist_add(&as->parent_keys, &n->key);
1838                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
1839         } else {
1840                 bch2_btree_set_root(as, n, iter);
1841         }
1842
1843         bch2_open_buckets_put(c, &n->ob);
1844
1845         bch2_btree_node_free_inmem(c, b, iter);
1846
1847         bch2_btree_iter_node_replace(iter, n);
1848
1849         bch2_btree_update_done(as);
1850         return 0;
1851 }
1852
1853 /**
1854  * bch2_btree_node_rewrite - Rewrite/move a btree node
1855  *
1856  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. when getting
1857  * the btree update reserve would have had to block)
1858  */
1859 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
1860                             __le64 seq, unsigned flags)
1861 {
1862         struct closure cl;
1863         struct btree *b;
1864         int ret;
1865
1866         flags |= BTREE_INSERT_NOFAIL;
1867
1868         closure_init_stack(&cl);
1869
1870         bch2_btree_iter_upgrade(iter, U8_MAX, true);
1871
1872         if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
1873                 if (!down_read_trylock(&c->gc_lock)) {
1874                         bch2_btree_iter_unlock(iter);
1875                         down_read(&c->gc_lock);
1876                 }
1877         }
1878
1879         while (1) {
1880                 ret = bch2_btree_iter_traverse(iter);
1881                 if (ret)
1882                         break;
1883
1884                 b = bch2_btree_iter_peek_node(iter);
1885                 if (!b || b->data->keys.seq != seq)
1886                         break;
1887
1888                 ret = __btree_node_rewrite(c, iter, b, flags, &cl);
1889                 if (ret != -EAGAIN &&
1890                     ret != -EINTR)
1891                         break;
1892
1893                 bch2_btree_iter_unlock(iter);
1894                 closure_sync(&cl);
1895         }
1896
1897         bch2_btree_iter_downgrade(iter);
1898
1899         if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
1900                 up_read(&c->gc_lock);
1901
1902         closure_sync(&cl);
1903         return ret;
1904 }
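
/*
 * Caller sketch (illustrative, not from the original source): @seq is the
 * node's sequence number, so the rewrite quietly does nothing if the node
 * has already been freed or replaced by the time the locks are retaken:
 *
 *      struct btree *b = bch2_btree_iter_peek_node(iter);
 *
 *      if (b)
 *              ret = bch2_btree_node_rewrite(c, iter, b->data->keys.seq, 0);
 */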
1905
1906 static void __bch2_btree_node_update_key(struct bch_fs *c,
1907                                          struct btree_update *as,
1908                                          struct btree_iter *iter,
1909                                          struct btree *b, struct btree *new_hash,
1910                                          struct bkey_i_extent *new_key)
1911 {
1912         struct btree *parent;
1913         int ret;
1914
1915         /*
1916          * Two corner cases that need to be thought about here:
1917          *
1918          * @b may not be reachable yet - there might be another interior update
1919          * operation waiting on @b to be written, and we're gonna deliver the
1920          * write completion to that interior update operation _before_
1921          * persisting the new_key update
1922          *
1923          * That ends up working without us having to do anything special here:
1924          * the reason is, we do kick off (and do the in memory updates) for the
1925          * update for @new_key before we return, creating a new interior_update
1926          * operation here.
1927          *
1928          * The new interior update operation here will in effect override the
1929          * previous one. The previous one was going to terminate - make @b
1930          * reachable - in one of two ways:
1931          * - updating the btree root pointer
1932          *   In that case,
1933          *   no, this doesn't work. argh.
1934          */
1935
1936         if (b->will_make_reachable)
1937                 as->must_rewrite = true;
1938
1939         btree_interior_update_add_node_reference(as, b);
1940
1941         parent = btree_node_parent(iter, b);
1942         if (parent) {
1943                 if (new_hash) {
1944                         bkey_copy(&new_hash->key, &new_key->k_i);
1945                         ret = bch2_btree_node_hash_insert(&c->btree_cache,
1946                                         new_hash, b->level, b->btree_id);
1947                         BUG_ON(ret);
1948                 }
1949
1950                 bch2_keylist_add(&as->parent_keys, &new_key->k_i);
1951                 bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
1952
1953                 if (new_hash) {
1954                         mutex_lock(&c->btree_cache.lock);
1955                         bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
1956
1957                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1958
1959                         bkey_copy(&b->key, &new_key->k_i);
1960                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1961                         BUG_ON(ret);
1962                         mutex_unlock(&c->btree_cache.lock);
1963                 } else {
1964                         bkey_copy(&b->key, &new_key->k_i);
1965                 }
1966         } else {
1967                 struct bch_fs_usage stats = { 0 };
1968
1969                 BUG_ON(btree_node_root(c, b) != b);
1970
1971                 bch2_btree_node_lock_write(b, iter);
1972
1973                 bch2_mark_key(c, BKEY_TYPE_BTREE,
1974                               bkey_i_to_s_c(&new_key->k_i),
1975                               true, 0,
1976                               gc_pos_btree_root(b->btree_id),
1977                               &stats, 0, 0);
1978                 bch2_btree_node_free_index(as, NULL,
1979                                            bkey_i_to_s_c(&b->key),
1980                                            &stats);
1981                 bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
1982                                     gc_pos_btree_root(b->btree_id));
1983
1984                 if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
1985                         mutex_lock(&c->btree_cache.lock);
1986                         bch2_btree_node_hash_remove(&c->btree_cache, b);
1987
1988                         bkey_copy(&b->key, &new_key->k_i);
1989                         ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
1990                         BUG_ON(ret);
1991                         mutex_unlock(&c->btree_cache.lock);
1992                 } else {
1993                         bkey_copy(&b->key, &new_key->k_i);
1994                 }
1995
1996                 btree_update_updated_root(as);
1997                 bch2_btree_node_unlock_write(b, iter);
1998         }
1999
2000         bch2_btree_update_done(as);
2001 }
2002
2003 int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
2004                                struct btree *b, struct bkey_i_extent *new_key)
2005 {
2006         struct btree *parent = btree_node_parent(iter, b);
2007         struct btree_update *as = NULL;
2008         struct btree *new_hash = NULL;
2009         struct closure cl;
2010         int ret;
2011
2012         closure_init_stack(&cl);
2013
2014         if (!bch2_btree_iter_upgrade(iter, U8_MAX, true))
2015                 return -EINTR;
2016
2017         if (!down_read_trylock(&c->gc_lock)) {
2018                 bch2_btree_iter_unlock(iter);
2019                 down_read(&c->gc_lock);
2020
2021                 if (!bch2_btree_iter_relock(iter)) {
2022                         ret = -EINTR;
2023                         goto err;
2024                 }
2025         }
2026
2027         /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
2028         if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
2029                 /* bch2_btree_reserve_get will unlock */
2030                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2031                 if (ret) {
2032                         ret = -EINTR;
2033
2034                         bch2_btree_iter_unlock(iter);
2035                         up_read(&c->gc_lock);
2036                         closure_sync(&cl);
2037                         down_read(&c->gc_lock);
2038
2039                         if (!bch2_btree_iter_relock(iter))
2040                                 goto err;
2041                 }
2042
2043                 new_hash = bch2_btree_node_mem_alloc(c);
2044         }
2045
2046         as = bch2_btree_update_start(c, iter->btree_id,
2047                 parent ? btree_update_reserve_required(c, parent) : 0,
2048                 BTREE_INSERT_NOFAIL|
2049                 BTREE_INSERT_USE_RESERVE|
2050                 BTREE_INSERT_USE_ALLOC_RESERVE,
2051                 &cl);
2052
2053         if (IS_ERR(as)) {
2054                 ret = PTR_ERR(as);
2055                 if (ret == -EAGAIN)
2056                         ret = -EINTR;
2057
2058                 if (ret != -EINTR)
2059                         goto err;
2060
2061                 bch2_btree_iter_unlock(iter);
2062                 up_read(&c->gc_lock);
2063                 closure_sync(&cl);
2064                 down_read(&c->gc_lock);
2065
2066                 if (!bch2_btree_iter_relock(iter))
2067                         goto err;
2068         }
2069
2070         ret = bch2_mark_bkey_replicas(c, BKEY_TYPE_BTREE,
2071                                       extent_i_to_s_c(new_key).s_c);
2072         if (ret)
2073                 goto err_free_update;
2074
2075         __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
2076
2077         bch2_btree_iter_downgrade(iter);
2078 err:
2079         if (new_hash) {
2080                 mutex_lock(&c->btree_cache.lock);
2081                 list_move(&new_hash->list, &c->btree_cache.freeable);
2082                 mutex_unlock(&c->btree_cache.lock);
2083
2084                 six_unlock_write(&new_hash->lock);
2085                 six_unlock_intent(&new_hash->lock);
2086         }
2087         up_read(&c->gc_lock);
2088         closure_sync(&cl);
2089         return ret;
2090 err_free_update:
2091         bch2_btree_update_free(as);
2092         goto err;
2093 }
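
/*
 * Hypothetical caller sketch (the declaration and sizing of @new_key are
 * assumptions, not shown in this file): start from the node's current key,
 * adjust its pointers, then hand it to bch2_btree_node_update_key():
 *
 *      bkey_copy(&new_key.k_i, &b->key);
 *      ... rewrite the pointers in new_key ...
 *      ret = bch2_btree_node_update_key(c, iter, b, &new_key);
 */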
2094
2095 /* Init code: */
2096
2097 /*
2098  * Only for filesystem bringup: when first reading the btree roots, or when
2099  * allocating the btree roots for a newly initialized filesystem:
2100  */
2101 void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
2102 {
2103         BUG_ON(btree_node_root(c, b));
2104
2105         __bch2_btree_set_root_inmem(c, b);
2106         bch2_btree_set_root_ondisk(c, b, READ);
2107 }
2108
2109 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
2110 {
2111         struct closure cl;
2112         struct btree *b;
2113         int ret;
2114
2115         closure_init_stack(&cl);
2116
2117         do {
2118                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
2119                 closure_sync(&cl);
2120         } while (ret);
2121
2122         b = bch2_btree_node_mem_alloc(c);
2123         bch2_btree_cache_cannibalize_unlock(c);
2124
2125         set_btree_node_fake(b);
2126         b->level        = 0;
2127         b->btree_id     = id;
2128
2129         bkey_extent_init(&b->key);
2130         b->key.k.p = POS_MAX;
2131         bkey_i_to_extent(&b->key)->v._data[0] = U64_MAX - id;
2132
2133         bch2_bset_init_first(b, &b->data->keys);
2134         bch2_btree_build_aux_trees(b);
2135
2136         b->data->min_key = POS_MIN;
2137         b->data->max_key = POS_MAX;
2138         b->data->format = bch2_btree_calc_format(b);
2139         btree_node_set_format(b, b->data->format);
2140
2141         ret = bch2_btree_node_hash_insert(&c->btree_cache, b, b->level, b->btree_id);
2142         BUG_ON(ret);
2143
2144         __bch2_btree_set_root_inmem(c, b);
2145
2146         six_unlock_write(&b->lock);
2147         six_unlock_intent(&b->lock);
2148 }
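
/*
 * Sketch of expected use during filesystem initialization (an assumption
 * about the caller, which is not in this file): allocate an empty root for
 * every btree:
 *
 *      for (i = 0; i < BTREE_ID_NR; i++)
 *              bch2_btree_root_alloc(c, i);
 */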
2149
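/*
 * Debug output: one line per in-flight interior update, showing the update
 * pointer, its mode (m), nodes_written (w), the outstanding closure
 * references (r), and its journal sequence number (j).
 */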
2150 ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
2151 {
2152         struct printbuf out = _PBUF(buf, PAGE_SIZE);
2153         struct btree_update *as;
2154
2155         mutex_lock(&c->btree_interior_update_lock);
2156         list_for_each_entry(as, &c->btree_interior_update_list, list)
2157                 pr_buf(&out, "%p m %u w %u r %u j %llu\n",
2158                        as,
2159                        as->mode,
2160                        as->nodes_written,
2161                        atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
2162                        as->journal.seq);
2163         mutex_unlock(&c->btree_interior_update_lock);
2164
2165         return out.pos - buf;
2166 }
2167
2168 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
2169 {
2170         size_t ret = 0;
2171         struct list_head *i;
2172
2173         mutex_lock(&c->btree_interior_update_lock);
2174         list_for_each(i, &c->btree_interior_update_list)
2175                 ret++;
2176         mutex_unlock(&c->btree_interior_update_lock);
2177
2178         return ret;
2179 }