1
2 #include "bcachefs.h"
3 #include "alloc.h"
4 #include "bkey_methods.h"
5 #include "btree_cache.h"
6 #include "btree_gc.h"
7 #include "btree_update.h"
8 #include "btree_io.h"
9 #include "btree_iter.h"
10 #include "btree_locking.h"
11 #include "buckets.h"
12 #include "extents.h"
13 #include "journal.h"
14 #include "keylist.h"
15 #include "super-io.h"
16
17 #include <linux/random.h>
18 #include <linux/sort.h>
19 #include <trace/events/bcachefs.h>
20
21 static void btree_interior_update_updated_root(struct bch_fs *,
22                                                struct btree_interior_update *,
23                                                enum btree_id);
24
25 /* Calculate ideal packed bkey format for new btree nodes: */
26
27 void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
28 {
29         struct bkey_packed *k;
30         struct bset_tree *t;
31         struct bkey uk;
32
33         bch2_bkey_format_add_pos(s, b->data->min_key);
34
35         for_each_bset(b, t)
36                 for (k = btree_bkey_first(b, t);
37                      k != btree_bkey_last(b, t);
38                      k = bkey_next(k))
39                         if (!bkey_whiteout(k)) {
40                                 uk = bkey_unpack_key(b, k);
41                                 bch2_bkey_format_add_key(s, &uk);
42                         }
43 }
44
45 static struct bkey_format bch2_btree_calc_format(struct btree *b)
46 {
47         struct bkey_format_state s;
48
49         bch2_bkey_format_init(&s);
50         __bch2_btree_calc_format(&s, b);
51
52         return bch2_bkey_format_done(&s);
53 }
54
55 static size_t btree_node_u64s_with_format(struct btree *b,
56                                           struct bkey_format *new_f)
57 {
58         struct bkey_format *old_f = &b->format;
59
60         /* stupid integer promotion rules */
61         ssize_t delta =
62             (((int) new_f->key_u64s - old_f->key_u64s) *
63              (int) b->nr.packed_keys) +
64             (((int) new_f->key_u64s - BKEY_U64s) *
65              (int) b->nr.unpacked_keys);
66
67         BUG_ON(delta + b->nr.live_u64s < 0);
68
69         return b->nr.live_u64s + delta;
70 }
71
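/*
 * Worked example for btree_node_u64s_with_format() above, with illustrative
 * numbers: if the new format packs keys in 2 u64s where the old format needed
 * 3, a node with 100 packed and 10 unpacked keys gets
 *
 *	delta = (2 - 3) * 100 + (2 - BKEY_U64s) * 10
 *
 * Only the key portion of each key changes size - values are unaffected - and,
 * as with bch2_btree_node_format_fits() below, this assumes every key can
 * actually be packed in the new format.
 */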
72 /**
73  * bch2_btree_node_format_fits - check if we could rewrite node with a new format
74  *
75  * This assumes all keys can pack with the new format -- it just checks if
76  * the re-packed keys would fit inside the node itself.
77  */
78 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
79                                 struct bkey_format *new_f)
80 {
81         size_t u64s = btree_node_u64s_with_format(b, new_f);
82
83         return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
84 }
85
86 /* Btree node freeing/allocation: */
87
88 /*
89  * We're doing the index update that makes @b unreachable, update stuff to
90  * reflect that:
91  *
92  * Must be called _before_ btree_interior_update_updated_root() or
93  * btree_interior_update_updated_btree():
94  */
95 static void bch2_btree_node_free_index(struct bch_fs *c, struct btree *b,
96                                       enum btree_id id, struct bkey_s_c k,
97                                       struct bch_fs_usage *stats)
98 {
99         struct btree_interior_update *as;
100         struct pending_btree_node_free *d;
101
102         mutex_lock(&c->btree_interior_update_lock);
103
104         for_each_pending_btree_node_free(c, as, d)
105                 if (!bkey_cmp(k.k->p, d->key.k.p) &&
106                     bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) &&
107                     !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k)))
108                         goto found;
109
110         BUG();
111 found:
112         d->index_update_done = true;
113
114         /*
115          * Btree nodes are accounted as freed in bch_alloc_stats when they're
116          * freed from the index:
117          */
118         stats->s[S_COMPRESSED][S_META]   -= c->sb.btree_node_size;
119         stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size;
120
121         /*
122          * We're dropping @k from the btree, but it's still live until the
123          * index update is persistent so we need to keep a reference around for
124          * mark and sweep to find - that's primarily what the
125          * btree_node_pending_free list is for.
126          *
127          * So here (when we set index_update_done = true), we're moving an
128          * existing reference to a different part of the larger "gc keyspace" -
129          * and the new position comes after the old position, since GC marks
130          * the pending free list after it walks the btree.
131          *
132          * If we move the reference while mark and sweep is _between_ the old
133          * and the new position, mark and sweep will see the reference twice
134          * and it'll get double accounted - so check for that here and subtract
135          * to cancel out one of mark and sweep's markings if necessary:
136          */
137
138         /*
139          * bch2_mark_key() compares the current gc pos to the pos we're
140          * moving this reference from, hence one comparison here:
141          */
142         if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
143                 struct bch_fs_usage tmp = { 0 };
144
145                 bch2_mark_key(c, bkey_i_to_s_c(&d->key),
146                              -c->sb.btree_node_size, true, b
147                              ? gc_pos_btree_node(b)
148                              : gc_pos_btree_root(id),
149                              &tmp, 0);
150                 /*
151                  * Don't apply tmp - pending deletes aren't tracked in
152                  * bch_alloc_stats:
153                  */
154         }
155
156         mutex_unlock(&c->btree_interior_update_lock);
157 }
158
159 static void __btree_node_free(struct bch_fs *c, struct btree *b,
160                               struct btree_iter *iter)
161 {
162         trace_btree_node_free(c, b);
163
164         BUG_ON(b == btree_node_root(c, b));
165         BUG_ON(b->ob);
166         BUG_ON(!list_empty(&b->write_blocked));
167
168         six_lock_write(&b->lock);
169
170         if (btree_node_dirty(b))
171                 bch2_btree_complete_write(c, b, btree_current_write(b));
172         clear_btree_node_dirty(b);
173
174         bch2_btree_node_hash_remove(c, b);
175
176         mutex_lock(&c->btree_cache_lock);
177         list_move(&b->list, &c->btree_cache_freeable);
178         mutex_unlock(&c->btree_cache_lock);
179
180         /*
181          * By using six_unlock_write() directly instead of
182          * bch2_btree_node_unlock_write(), we don't update the iterator's
183          * sequence numbers and cause future bch2_btree_node_relock() calls to
184          * fail:
185          */
186         six_unlock_write(&b->lock);
187 }
188
189 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
190 {
191         struct open_bucket *ob = b->ob;
192
193         b->ob = NULL;
194
195         __btree_node_free(c, b, NULL);
196
197         bch2_open_bucket_put(c, ob);
198 }
199
200 void bch2_btree_node_free_inmem(struct btree_iter *iter, struct btree *b)
201 {
202         bch2_btree_iter_node_drop_linked(iter, b);
203
204         __btree_node_free(iter->c, b, iter);
205
206         bch2_btree_iter_node_drop(iter, b);
207 }
208
209 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
210                                        struct pending_btree_node_free *pending)
211 {
212         struct bch_fs_usage stats = { 0 };
213
214         BUG_ON(!pending->index_update_done);
215
216         bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
217                      -c->sb.btree_node_size, true,
218                      gc_phase(GC_PHASE_PENDING_DELETE),
219                      &stats, 0);
220         /*
221          * Don't apply stats - pending deletes aren't tracked in
222          * bch_alloc_stats:
223          */
224 }
225
226 void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b)
227 {
228         bch2_open_bucket_put(c, b->ob);
229         b->ob = NULL;
230 }
231
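/*
 * Allocate the on disk space for a new btree node: prefer an allocation
 * stashed in the btree_reserve_cache, otherwise allocate a btree node's worth
 * of sectors with bch2_alloc_sectors(), retrying if we only got part of an
 * extent, then pair the allocation with an in-memory node from
 * bch2_btree_node_mem_alloc().
 */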
232 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
233                                             bool use_reserve,
234                                             struct disk_reservation *res,
235                                             struct closure *cl)
236 {
237         BKEY_PADDED(k) tmp;
238         struct open_bucket *ob;
239         struct btree *b;
240         unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE;
241
242         mutex_lock(&c->btree_reserve_cache_lock);
243         if (c->btree_reserve_cache_nr > reserve) {
244                 struct btree_alloc *a =
245                         &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
246
247                 ob = a->ob;
248                 bkey_copy(&tmp.k, &a->k);
249                 mutex_unlock(&c->btree_reserve_cache_lock);
250                 goto mem_alloc;
251         }
252         mutex_unlock(&c->btree_reserve_cache_lock);
253
254 retry:
255         /* alloc_sectors is weird, I suppose */
256         bkey_extent_init(&tmp.k);
257         tmp.k.k.size = c->sb.btree_node_size;
258
259         ob = bch2_alloc_sectors(c, &c->btree_write_point,
260                                bkey_i_to_extent(&tmp.k),
261                                res->nr_replicas,
262                                c->opts.metadata_replicas_required,
263                                use_reserve ? RESERVE_BTREE : RESERVE_NONE,
264                                cl);
265         if (IS_ERR(ob))
266                 return ERR_CAST(ob);
267
268         if (tmp.k.k.size < c->sb.btree_node_size) {
269                 bch2_open_bucket_put(c, ob);
270                 goto retry;
271         }
272 mem_alloc:
273         b = bch2_btree_node_mem_alloc(c);
274
275         /* we hold cannibalize_lock: */
276         BUG_ON(IS_ERR(b));
277         BUG_ON(b->ob);
278
279         bkey_copy(&b->key, &tmp.k);
280         b->key.k.size = 0;
281         b->ob = ob;
282
283         return b;
284 }
285
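/*
 * Take a node preallocated in @reserve and set it up as a new, empty btree
 * node at the given level and btree id: add it to the btree node hash table,
 * mark it dirty, and initialize its header, first bset and auxiliary search
 * trees.
 */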
286 static struct btree *bch2_btree_node_alloc(struct bch_fs *c,
287                                           unsigned level, enum btree_id id,
288                                           struct btree_reserve *reserve)
289 {
290         struct btree *b;
291
292         BUG_ON(!reserve->nr);
293
294         b = reserve->b[--reserve->nr];
295
296         BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
297
298         set_btree_node_accessed(b);
299         set_btree_node_dirty(b);
300
301         bch2_bset_init_first(b, &b->data->keys);
302         memset(&b->nr, 0, sizeof(b->nr));
303         b->data->magic = cpu_to_le64(bset_magic(c));
304         b->data->flags = 0;
305         SET_BTREE_NODE_ID(b->data, id);
306         SET_BTREE_NODE_LEVEL(b->data, level);
307         b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr;
308
309         bch2_btree_build_aux_trees(b);
310
311         bch2_check_mark_super(c, &b->key, true);
312
313         trace_btree_node_alloc(c, b);
314         return b;
315 }
316
317 struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c,
318                                                   struct btree *b,
319                                                   struct bkey_format format,
320                                                   struct btree_reserve *reserve)
321 {
322         struct btree *n;
323
324         n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
325
326         n->data->min_key        = b->data->min_key;
327         n->data->max_key        = b->data->max_key;
328         n->data->format         = format;
329
330         btree_node_set_format(n, format);
331
332         bch2_btree_sort_into(c, n, b);
333
334         btree_node_reset_sib_u64s(n);
335
336         n->key.k.p = b->key.k.p;
337         return n;
338 }
339
340 static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c,
341                                                 struct btree *b,
342                                                 struct btree_reserve *reserve)
343 {
344         struct bkey_format new_f = bch2_btree_calc_format(b);
345
346         /*
347          * The keys might expand with the new format - if they wouldn't fit in
348          * the btree node anymore, use the old format for now:
349          */
350         if (!bch2_btree_node_format_fits(c, b, &new_f))
351                 new_f = b->format;
352
353         return __bch2_btree_node_alloc_replacement(c, b, new_f, reserve);
354 }
355
356 static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b,
357                                      struct btree_reserve *btree_reserve)
358 {
359         struct btree *old = btree_node_root(c, b);
360
361         /* Root nodes cannot be reaped */
362         mutex_lock(&c->btree_cache_lock);
363         list_del_init(&b->list);
364         mutex_unlock(&c->btree_cache_lock);
365
366         mutex_lock(&c->btree_root_lock);
367         btree_node_root(c, b) = b;
368         mutex_unlock(&c->btree_root_lock);
369
370         if (btree_reserve) {
371                 /*
372                  * New allocation (we're not being called because we're in
373                  * bch2_btree_root_read()) - do marking while holding
374                  * btree_root_lock:
375                  */
376                 struct bch_fs_usage stats = { 0 };
377
378                 bch2_mark_key(c, bkey_i_to_s_c(&b->key),
379                              c->sb.btree_node_size, true,
380                              gc_pos_btree_root(b->btree_id),
381                              &stats, 0);
382
383                 if (old)
384                         bch2_btree_node_free_index(c, NULL, old->btree_id,
385                                                   bkey_i_to_s_c(&old->key),
386                                                   &stats);
387                 bch2_fs_usage_apply(c, &stats, &btree_reserve->disk_res,
388                                    gc_pos_btree_root(b->btree_id));
389         }
390
391         bch2_recalc_btree_reserve(c);
392 }
393
394 static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
395 {
396         struct btree_root *r = &c->btree_roots[b->btree_id];
397
398         mutex_lock(&c->btree_root_lock);
399
400         BUG_ON(b != r->b);
401         bkey_copy(&r->key, &b->key);
402         r->level = b->level;
403         r->alive = true;
404
405         mutex_unlock(&c->btree_root_lock);
406 }
407
408 /*
409  * Only for filesystem bringup, when first reading the btree roots or allocating
410  * btree roots when initializing a new filesystem:
411  */
412 void bch2_btree_set_root_initial(struct bch_fs *c, struct btree *b,
413                                 struct btree_reserve *btree_reserve)
414 {
415         BUG_ON(btree_node_root(c, b));
416
417         bch2_btree_set_root_inmem(c, b, btree_reserve);
418         bch2_btree_set_root_ondisk(c, b);
419 }
420
421 /**
422  * bch2_btree_set_root - update the root in memory and on disk
423  *
424  * To ensure forward progress, the current task must not be holding any
425  * btree node write locks. However, you must hold an intent lock on the
426  * old root.
427  *
428  * Note: This allocates a journal entry but doesn't add any keys to
429  * it.  All the btree roots are part of every journal write, so there
430  * is nothing new to be done.  This just guarantees that there is a
431  * journal write.
432  */
433 static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b,
434                                struct btree_interior_update *as,
435                                struct btree_reserve *btree_reserve)
436 {
437         struct bch_fs *c = iter->c;
438         struct btree *old;
439
440         trace_btree_set_root(c, b);
441         BUG_ON(!b->written);
442
443         old = btree_node_root(c, b);
444
445         /*
446          * Ensure no one is using the old root while we switch to the
447          * new root:
448          */
449         bch2_btree_node_lock_write(old, iter);
450
451         bch2_btree_set_root_inmem(c, b, btree_reserve);
452
453         btree_interior_update_updated_root(c, as, iter->btree_id);
454
455         /*
456          * Unlock old root after new root is visible:
457          *
458          * The new root isn't persistent, but that's ok: we still have
459          * an intent lock on the new root, and any updates that would
460          * depend on the new root would have to update the new root.
461          */
462         bch2_btree_node_unlock_write(old, iter);
463 }
464
465 static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level,
466                                         enum btree_id id,
467                                         struct btree_reserve *reserve)
468 {
469         struct btree *b = bch2_btree_node_alloc(c, level, id, reserve);
470
471         b->data->min_key = POS_MIN;
472         b->data->max_key = POS_MAX;
473         b->data->format = bch2_btree_calc_format(b);
474         b->key.k.p = POS_MAX;
475
476         btree_node_set_format(b, b->data->format);
477         bch2_btree_build_aux_trees(b);
478
479         six_unlock_write(&b->lock);
480
481         return b;
482 }
483
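/*
 * Release a reserve we're done with: drop the disk reservation, stash each
 * remaining node's allocation in the btree_reserve_cache if there's room
 * (otherwise release its open bucket), and free the in-memory nodes.
 */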
484 void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve)
485 {
486         bch2_disk_reservation_put(c, &reserve->disk_res);
487
488         mutex_lock(&c->btree_reserve_cache_lock);
489
490         while (reserve->nr) {
491                 struct btree *b = reserve->b[--reserve->nr];
492
493                 six_unlock_write(&b->lock);
494
495                 if (c->btree_reserve_cache_nr <
496                     ARRAY_SIZE(c->btree_reserve_cache)) {
497                         struct btree_alloc *a =
498                                 &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
499
500                         a->ob = b->ob;
501                         b->ob = NULL;
502                         bkey_copy(&a->k, &b->key);
503                 } else {
504                         bch2_open_bucket_put(c, b->ob);
505                         b->ob = NULL;
506                 }
507
508                 __btree_node_free(c, b, NULL);
509
510                 six_unlock_intent(&b->lock);
511         }
512
513         mutex_unlock(&c->btree_reserve_cache_lock);
514
515         mempool_free(reserve, &c->btree_reserve_pool);
516 }
517
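/*
 * Get a disk reservation plus @nr_nodes preallocated btree nodes; on failure,
 * whatever was allocated so far is released with bch2_btree_reserve_put().
 */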
518 static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c,
519                                                      unsigned nr_nodes,
520                                                      unsigned flags,
521                                                      struct closure *cl)
522 {
523         struct btree_reserve *reserve;
524         struct btree *b;
525         struct disk_reservation disk_res = { 0, 0 };
526         unsigned sectors = nr_nodes * c->sb.btree_node_size;
527         int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD|
528                 BCH_DISK_RESERVATION_METADATA;
529
530         if (flags & BTREE_INSERT_NOFAIL)
531                 disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
532
533         /*
534          * This check isn't necessary for correctness - it's just to potentially
535          * prevent us from doing a lot of work that'll end up being wasted:
536          */
537         ret = bch2_journal_error(&c->journal);
538         if (ret)
539                 return ERR_PTR(ret);
540
541         if (bch2_disk_reservation_get(c, &disk_res, sectors, disk_res_flags))
542                 return ERR_PTR(-ENOSPC);
543
544         BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
545
546         /*
547          * Protects reaping from the btree node cache and using the btree node
548          * open bucket reserve:
549          */
550         ret = bch2_btree_node_cannibalize_lock(c, cl);
551         if (ret) {
552                 bch2_disk_reservation_put(c, &disk_res);
553                 return ERR_PTR(ret);
554         }
555
556         reserve = mempool_alloc(&c->btree_reserve_pool, GFP_NOIO);
557
558         reserve->disk_res = disk_res;
559         reserve->nr = 0;
560
561         while (reserve->nr < nr_nodes) {
562                 b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE,
563                                            &disk_res, cl);
564                 if (IS_ERR(b)) {
565                         ret = PTR_ERR(b);
566                         goto err_free;
567                 }
568
569                 reserve->b[reserve->nr++] = b;
570         }
571
572         bch2_btree_node_cannibalize_unlock(c);
573         return reserve;
574 err_free:
575         bch2_btree_reserve_put(c, reserve);
576         bch2_btree_node_cannibalize_unlock(c);
577         trace_btree_reserve_get_fail(c, nr_nodes, cl);
578         return ERR_PTR(ret);
579 }
580
581 struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
582                                             struct btree *b,
583                                             unsigned extra_nodes,
584                                             unsigned flags,
585                                             struct closure *cl)
586 {
587         unsigned depth = btree_node_root(c, b)->level - b->level;
588         unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
589
590         return __bch2_btree_reserve_get(c, nr_nodes, flags, cl);
591
592 }
593
594 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
595                          struct closure *writes)
596 {
597         struct closure cl;
598         struct btree_reserve *reserve;
599         struct btree *b;
600
601         closure_init_stack(&cl);
602
603         while (1) {
604                 /* XXX haven't calculated capacity yet :/ */
605                 reserve = __bch2_btree_reserve_get(c, 1, 0, &cl);
606                 if (!IS_ERR(reserve))
607                         break;
608
609                 if (PTR_ERR(reserve) == -ENOSPC)
610                         return PTR_ERR(reserve);
611
612                 closure_sync(&cl);
613         }
614
615         b = __btree_root_alloc(c, 0, id, reserve);
616
617         bch2_btree_node_write(c, b, writes, SIX_LOCK_intent, -1);
618
619         bch2_btree_set_root_initial(c, b, reserve);
620         bch2_btree_open_bucket_put(c, b);
621         six_unlock_intent(&b->lock);
622
623         bch2_btree_reserve_put(c, reserve);
624
625         return 0;
626 }
627
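/*
 * Insert a btree node pointer into an interior node: mark the new key for
 * allocation accounting, mark any existing pointer it overwrites as a pending
 * delete (bch2_btree_node_free_index()), apply the resulting usage delta, then
 * do the actual bset insert.
 */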
628 static void bch2_insert_fixup_btree_ptr(struct btree_iter *iter,
629                                        struct btree *b,
630                                        struct bkey_i *insert,
631                                        struct btree_node_iter *node_iter,
632                                        struct disk_reservation *disk_res)
633 {
634         struct bch_fs *c = iter->c;
635         struct bch_fs_usage stats = { 0 };
636         struct bkey_packed *k;
637         struct bkey tmp;
638
639         if (bkey_extent_is_data(&insert->k))
640                 bch2_mark_key(c, bkey_i_to_s_c(insert),
641                              c->sb.btree_node_size, true,
642                              gc_pos_btree_node(b), &stats, 0);
643
644         while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
645                !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false))
646                 bch2_btree_node_iter_advance(node_iter, b);
647
648         /*
649          * If we're overwriting, look up pending delete and mark so that gc
650          * marks it on the pending delete list:
651          */
652         if (k && !bkey_cmp_packed(b, k, &insert->k))
653                 bch2_btree_node_free_index(c, b, iter->btree_id,
654                                           bkey_disassemble(b, k, &tmp),
655                                           &stats);
656
657         bch2_fs_usage_apply(c, &stats, disk_res, gc_pos_btree_node(b));
658
659         bch2_btree_bset_insert_key(iter, b, node_iter, insert);
660         set_btree_node_dirty(b);
661 }
662
663 /* Inserting into a given leaf node (last stage of insert): */
664
665 /* Handle overwrites and do insert, for non extents: */
666 bool bch2_btree_bset_insert_key(struct btree_iter *iter,
667                                struct btree *b,
668                                struct btree_node_iter *node_iter,
669                                struct bkey_i *insert)
670 {
671         const struct bkey_format *f = &b->format;
672         struct bkey_packed *k;
673         struct bset_tree *t;
674         unsigned clobber_u64s;
675
676         EBUG_ON(btree_node_just_written(b));
677         EBUG_ON(bset_written(b, btree_bset_last(b)));
678         EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
679         EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
680                 bkey_cmp(insert->k.p, b->data->max_key) > 0);
681         BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b));
682
683         k = bch2_btree_node_iter_peek_all(node_iter, b);
684         if (k && !bkey_cmp_packed(b, k, &insert->k)) {
685                 BUG_ON(bkey_whiteout(k));
686
687                 t = bch2_bkey_to_bset(b, k);
688
689                 if (bset_unwritten(b, bset(b, t)) &&
690                     bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
691                         BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
692
693                         k->type = insert->k.type;
694                         memcpy_u64s(bkeyp_val(f, k), &insert->v,
695                                     bkey_val_u64s(&insert->k));
696                         return true;
697                 }
698
699                 insert->k.needs_whiteout = k->needs_whiteout;
700
701                 btree_keys_account_key_drop(&b->nr, t - b->set, k);
702
703                 if (t == bset_tree_last(b)) {
704                         clobber_u64s = k->u64s;
705
706                         /*
707                          * If we're deleting, and the key we're deleting doesn't
708                          * need a whiteout (it wasn't overwriting a key that had
709                          * been written to disk) - just delete it:
710                          */
711                         if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
712                                 bch2_bset_delete(b, k, clobber_u64s);
713                                 bch2_btree_node_iter_fix(iter, b, node_iter, t,
714                                                         k, clobber_u64s, 0);
715                                 return true;
716                         }
717
718                         goto overwrite;
719                 }
720
721                 k->type = KEY_TYPE_DELETED;
722                 bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
723                                         k->u64s, k->u64s);
724
725                 if (bkey_whiteout(&insert->k)) {
726                         reserve_whiteout(b, t, k);
727                         return true;
728                 } else {
729                         k->needs_whiteout = false;
730                 }
731         } else {
732                 /*
733                  * Deleting, but the key to delete wasn't found - nothing to do:
734                  */
735                 if (bkey_whiteout(&insert->k))
736                         return false;
737
738                 insert->k.needs_whiteout = false;
739         }
740
741         t = bset_tree_last(b);
742         k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
743         clobber_u64s = 0;
744 overwrite:
745         bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
746         if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
747                 bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
748                                         clobber_u64s, k->u64s);
749         return true;
750 }
751
752 static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
753                                unsigned i)
754 {
755         struct bch_fs *c = container_of(j, struct bch_fs, journal);
756         struct btree_write *w = container_of(pin, struct btree_write, journal);
757         struct btree *b = container_of(w, struct btree, writes[i]);
758
759         six_lock_read(&b->lock);
760         /*
761          * Reusing a btree node can race with the journal reclaim code calling
762          * the journal pin flush fn, and there's no good fix for this: we don't
763          * really want journal_pin_drop() to block until the flush fn is no
764          * longer running, because journal_pin_drop() is called from the btree
765          * node write endio function, and we can't wait on the flush fn to
766          * finish running in mca_reap() - where we make reused btree nodes ready
767          * to use again - because there, we're holding the lock this function
768          * needs - deadlock.
769          *
770          * So, the b->level check is a hack so we don't try to write nodes we
771          * shouldn't:
772          */
773         if (!b->level)
774                 bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, i);
775         six_unlock_read(&b->lock);
776 }
777
778 static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin)
779 {
780         return __btree_node_flush(j, pin, 0);
781 }
782
783 static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin)
784 {
785         return __btree_node_flush(j, pin, 1);
786 }
787
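/*
 * Journal an insert: make sure the btree node's current write holds a journal
 * pin - with btree_node_flush0/1 as the flush callback, so journal reclaim can
 * force the node to be written - then add the key to the transaction's journal
 * reservation and record the journal sequence number in the bset.
 */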
788 void bch2_btree_journal_key(struct btree_insert *trans,
789                            struct btree_iter *iter,
790                            struct bkey_i *insert)
791 {
792         struct bch_fs *c = trans->c;
793         struct journal *j = &c->journal;
794         struct btree *b = iter->nodes[0];
795         struct btree_write *w = btree_current_write(b);
796
797         EBUG_ON(iter->level || b->level);
798         EBUG_ON(!trans->journal_res.ref &&
799                 test_bit(JOURNAL_REPLAY_DONE, &j->flags));
800
801         if (!journal_pin_active(&w->journal))
802                 bch2_journal_pin_add(j, &w->journal,
803                                     btree_node_write_idx(b) == 0
804                                     ? btree_node_flush0
805                                     : btree_node_flush1);
806
807         if (trans->journal_res.ref) {
808                 u64 seq = trans->journal_res.seq;
809                 bool needs_whiteout = insert->k.needs_whiteout;
810
811                 /*
812                  * We have a bug where we're seeing an extent with an invalid crc
813                  * entry in the journal - trying to track it down:
814                  */
815                 BUG_ON(bch2_bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
816
817                 /* ick */
818                 insert->k.needs_whiteout = false;
819                 bch2_journal_add_keys(j, &trans->journal_res,
820                                      b->btree_id, insert);
821                 insert->k.needs_whiteout = needs_whiteout;
822
823                 if (trans->journal_seq)
824                         *trans->journal_seq = seq;
825                 btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
826         }
827
828         if (!btree_node_dirty(b))
829                 set_btree_node_dirty(b);
830 }
831
832 static enum btree_insert_ret
833 bch2_insert_fixup_key(struct btree_insert *trans,
834                      struct btree_insert_entry *insert)
835 {
836         struct btree_iter *iter = insert->iter;
837
838         BUG_ON(iter->level);
839
840         if (bch2_btree_bset_insert_key(iter,
841                                       iter->nodes[0],
842                                       &iter->node_iters[0],
843                                       insert->k))
844                 bch2_btree_journal_key(trans, iter, insert->k);
845
846         trans->did_work = true;
847         return BTREE_INSERT_OK;
848 }
849
850 static void verify_keys_sorted(struct keylist *l)
851 {
852 #ifdef CONFIG_BCACHEFS_DEBUG
853         struct bkey_i *k;
854
855         for_each_keylist_key(l, k)
856                 BUG_ON(bkey_next(k) != l->top &&
857                        bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
858 #endif
859 }
860
861 static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter)
862 {
863         struct bch_fs *c = iter->c;
864
865         bch2_btree_node_lock_write(b, iter);
866
867         if (btree_node_just_written(b) &&
868             bch2_btree_post_write_cleanup(c, b))
869                 bch2_btree_iter_reinit_node(iter, b);
870
871         /*
872          * If the last bset has been written, or if it's gotten too big - start
873          * a new bset to insert into:
874          */
875         if (want_new_bset(c, b))
876                 bch2_btree_init_next(c, b, iter);
877 }
878
879 /* Asynchronous interior node update machinery */
880
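/*
 * A btree_interior_update tracks one in-flight update that makes new btree
 * nodes visible. It starts out as BTREE_INTERIOR_NO_UPDATE and moves to
 * UPDATING_NODE, UPDATING_ROOT or UPDATING_AS depending on where the new
 * pointers end up (see btree_interior_update_nodes_written() below); once the
 * new nodes are written and the index update is persistent, the replaced nodes
 * are freed from the pending list and the update itself is freed.
 */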
881 struct btree_interior_update *
882 bch2_btree_interior_update_alloc(struct bch_fs *c)
883 {
884         struct btree_interior_update *as;
885
886         as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
887         memset(as, 0, sizeof(*as));
888         closure_init(&as->cl, &c->cl);
889         as->c           = c;
890         as->mode        = BTREE_INTERIOR_NO_UPDATE;
891
892         bch2_keylist_init(&as->parent_keys, as->inline_keys,
893                          ARRAY_SIZE(as->inline_keys));
894
895         mutex_lock(&c->btree_interior_update_lock);
896         list_add(&as->list, &c->btree_interior_update_list);
897         mutex_unlock(&c->btree_interior_update_lock);
898
899         return as;
900 }
901
902 static void btree_interior_update_free(struct closure *cl)
903 {
904         struct btree_interior_update *as = container_of(cl, struct btree_interior_update, cl);
905
906         mempool_free(as, &as->c->btree_interior_update_pool);
907 }
908
909 static void btree_interior_update_nodes_reachable(struct closure *cl)
910 {
911         struct btree_interior_update *as =
912                 container_of(cl, struct btree_interior_update, cl);
913         struct bch_fs *c = as->c;
914         unsigned i;
915
916         bch2_journal_pin_drop(&c->journal, &as->journal);
917
918         mutex_lock(&c->btree_interior_update_lock);
919
920         for (i = 0; i < as->nr_pending; i++)
921                 bch2_btree_node_free_ondisk(c, &as->pending[i]);
922         as->nr_pending = 0;
923
924         mutex_unlock(&c->btree_interior_update_lock);
925
926         mutex_lock(&c->btree_interior_update_lock);
927         list_del(&as->list);
928         mutex_unlock(&c->btree_interior_update_lock);
929
930         closure_wake_up(&as->wait);
931
932         closure_return_with_destructor(cl, btree_interior_update_free);
933 }
934
935 static void btree_interior_update_nodes_written(struct closure *cl)
936 {
937         struct btree_interior_update *as =
938                 container_of(cl, struct btree_interior_update, cl);
939         struct bch_fs *c = as->c;
940         struct btree *b;
941
942         if (bch2_journal_error(&c->journal)) {
943                 /* XXX what? */
944         }
945
946         /* XXX: missing error handling, damnit */
947
948         /* check for journal error, bail out if we flushed */
949
950         /*
951          * We did an update to a parent node where the pointers we added pointed
952          * to child nodes that weren't written yet: now, the child nodes have
953          * been written so we can write out the update to the interior node.
954          */
955 retry:
956         mutex_lock(&c->btree_interior_update_lock);
957         switch (as->mode) {
958         case BTREE_INTERIOR_NO_UPDATE:
959                 BUG();
960         case BTREE_INTERIOR_UPDATING_NODE:
961                 /* The usual case: */
962                 b = READ_ONCE(as->b);
963
964                 if (!six_trylock_read(&b->lock)) {
965                         mutex_unlock(&c->btree_interior_update_lock);
966                         six_lock_read(&b->lock);
967                         six_unlock_read(&b->lock);
968                         goto retry;
969                 }
970
971                 BUG_ON(!btree_node_dirty(b));
972                 closure_wait(&btree_current_write(b)->wait, cl);
973
974                 list_del(&as->write_blocked_list);
975
976                 if (list_empty(&b->write_blocked))
977                         bch2_btree_node_write(c, b, NULL, SIX_LOCK_read, -1);
978                 six_unlock_read(&b->lock);
979                 break;
980
981         case BTREE_INTERIOR_UPDATING_AS:
982                 /*
983                  * The btree node we originally updated has been freed and is
984                  * being rewritten - so we don't need to write anything here, we just
985                  * need to signal to that btree_interior_update that it's ok to make the
986                  * new replacement node visible:
987                  */
988                 closure_put(&as->parent_as->cl);
989
990                 /*
991                  * and then we have to wait on that btree_interior_update to finish:
992                  */
993                 closure_wait(&as->parent_as->wait, cl);
994                 break;
995
996         case BTREE_INTERIOR_UPDATING_ROOT:
997                 /* b is the new btree root: */
998                 b = READ_ONCE(as->b);
999
1000                 if (!six_trylock_read(&b->lock)) {
1001                         mutex_unlock(&c->btree_interior_update_lock);
1002                         six_lock_read(&b->lock);
1003                         six_unlock_read(&b->lock);
1004                         goto retry;
1005                 }
1006
1007                 BUG_ON(c->btree_roots[b->btree_id].as != as);
1008                 c->btree_roots[b->btree_id].as = NULL;
1009
1010                 bch2_btree_set_root_ondisk(c, b);
1011
1012                 /*
1013          * We don't have to wait on anything here (before
1014                  * btree_interior_update_nodes_reachable frees the old nodes
1015                  * ondisk) - we've ensured that the very next journal write will
1016                  * have the pointer to the new root, and before the allocator
1017                  * can reuse the old nodes it'll have to do a journal commit:
1018                  */
1019                 six_unlock_read(&b->lock);
1020         }
1021         mutex_unlock(&c->btree_interior_update_lock);
1022
1023         continue_at(cl, btree_interior_update_nodes_reachable, system_wq);
1024 }
1025
1026 /*
1027  * We're updating @b with pointers to nodes that haven't finished writing yet:
1028  * block @b from being written until @as completes
1029  */
1030 static void btree_interior_update_updated_btree(struct bch_fs *c,
1031                                                 struct btree_interior_update *as,
1032                                                 struct btree *b)
1033 {
1034         mutex_lock(&c->btree_interior_update_lock);
1035
1036         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
1037         BUG_ON(!btree_node_dirty(b));
1038
1039         as->mode = BTREE_INTERIOR_UPDATING_NODE;
1040         as->b = b;
1041         list_add(&as->write_blocked_list, &b->write_blocked);
1042
1043         mutex_unlock(&c->btree_interior_update_lock);
1044
1045         bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
1046
1047         continue_at(&as->cl, btree_interior_update_nodes_written,
1048                     system_freezable_wq);
1049 }
1050
1051 static void btree_interior_update_updated_root(struct bch_fs *c,
1052                                                struct btree_interior_update *as,
1053                                                enum btree_id btree_id)
1054 {
1055         struct btree_root *r = &c->btree_roots[btree_id];
1056
1057         mutex_lock(&c->btree_interior_update_lock);
1058
1059         BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
1060
1061         /*
1062          * Old root might not be persistent yet - if so, redirect its
1063          * btree_interior_update operation to point to us:
1064          */
1065         if (r->as) {
1066                 BUG_ON(r->as->mode != BTREE_INTERIOR_UPDATING_ROOT);
1067
1068                 r->as->b = NULL;
1069                 r->as->mode = BTREE_INTERIOR_UPDATING_AS;
1070                 r->as->parent_as = as;
1071                 closure_get(&as->cl);
1072         }
1073
1074         as->mode = BTREE_INTERIOR_UPDATING_ROOT;
1075         as->b = r->b;
1076         r->as = as;
1077
1078         mutex_unlock(&c->btree_interior_update_lock);
1079
1080         bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
1081
1082         continue_at(&as->cl, btree_interior_update_nodes_written,
1083                     system_freezable_wq);
1084 }
1085
1086 static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin)
1087 {
1088         struct btree_interior_update *as =
1089                 container_of(pin, struct btree_interior_update, journal);
1090
1091         bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
1092 }
1093
1094 /*
1095  * @b is being split/rewritten: it may have pointers to not-yet-written btree
1096  * nodes and thus outstanding btree_interior_updates - redirect @b's
1097  * btree_interior_updates to point to this btree_interior_update:
1098  */
1099 void bch2_btree_interior_update_will_free_node(struct bch_fs *c,
1100                                               struct btree_interior_update *as,
1101                                               struct btree *b)
1102 {
1103         struct btree_interior_update *p, *n;
1104         struct pending_btree_node_free *d;
1105         struct bset_tree *t;
1106
1107         /*
1108          * Does this node have data that hasn't been written in the journal?
1109          *
1110          * If so, we have to wait for the corresponding journal entry to be
1111          * written before making the new nodes reachable - we can't just carry
1112          * over the bset->journal_seq tracking, since we'll be mixing those keys
1113          * in with keys that aren't in the journal anymore:
1114          */
1115         for_each_bset(b, t)
1116                 as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
1117
1118         /*
1119          * Does this node have unwritten data that has a pin on the journal?
1120          *
1121          * If so, transfer that pin to the btree_interior_update operation -
1122          * note that if we're freeing multiple nodes, we only need to keep the
1123          * oldest pin of any of the nodes we're freeing. We'll release the pin
1124          * when the new nodes are persistent and reachable on disk:
1125          */
1126         bch2_journal_pin_add_if_older(&c->journal,
1127                                      &b->writes[0].journal,
1128                                      &as->journal, interior_update_flush);
1129         bch2_journal_pin_add_if_older(&c->journal,
1130                                      &b->writes[1].journal,
1131                                      &as->journal, interior_update_flush);
1132
1133         mutex_lock(&c->btree_interior_update_lock);
1134
1135         /*
1136          * Does this node have any btree_interior_update operations preventing
1137          * it from being written?
1138          *
1139          * If so, redirect them to point to this btree_interior_update: we can
1140          * write out our new nodes, but we won't make them visible until those
1141          * operations complete
1142          */
1143         list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
1144                 BUG_ON(p->mode != BTREE_INTERIOR_UPDATING_NODE);
1145
1146                 p->mode = BTREE_INTERIOR_UPDATING_AS;
1147                 list_del(&p->write_blocked_list);
1148                 p->b = NULL;
1149                 p->parent_as = as;
1150                 closure_get(&as->cl);
1151         }
1152
1153         /* Add this node to the list of nodes being freed: */
1154         BUG_ON(as->nr_pending >= ARRAY_SIZE(as->pending));
1155
1156         d = &as->pending[as->nr_pending++];
1157         d->index_update_done    = false;
1158         d->seq                  = b->data->keys.seq;
1159         d->btree_id             = b->btree_id;
1160         d->level                = b->level;
1161         bkey_copy(&d->key, &b->key);
1162
1163         mutex_unlock(&c->btree_interior_update_lock);
1164 }
1165
1166 static void btree_node_interior_verify(struct btree *b)
1167 {
1168         struct btree_node_iter iter;
1169         struct bkey_packed *k;
1170
1171         BUG_ON(!b->level);
1172
1173         bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
1174 #if 1
1175         BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
1176                bkey_cmp_left_packed(b, k, &b->key.k.p));
1177
1178         BUG_ON((bch2_btree_node_iter_advance(&iter, b),
1179                 !bch2_btree_node_iter_end(&iter)));
1180 #else
1181         const char *msg;
1182
1183         msg = "not found";
1184         k = bch2_btree_node_iter_peek(&iter, b);
1185         if (!k)
1186                 goto err;
1187
1188         msg = "isn't what it should be";
1189         if (bkey_cmp_left_packed(b, k, &b->key.k.p))
1190                 goto err;
1191
1192         bch2_btree_node_iter_advance(&iter, b);
1193
1194         msg = "isn't last key";
1195         if (!bch2_btree_node_iter_end(&iter))
1196                 goto err;
1197         return;
1198 err:
1199         bch2_dump_btree_node(b);
1200         printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode,
1201                b->key.k.p.offset, msg);
1202         BUG();
1203 #endif
1204 }
1205
1206 static enum btree_insert_ret
1207 bch2_btree_insert_keys_interior(struct btree *b,
1208                                struct btree_iter *iter,
1209                                struct keylist *insert_keys,
1210                                struct btree_interior_update *as,
1211                                struct btree_reserve *res)
1212 {
1213         struct bch_fs *c = iter->c;
1214         struct btree_iter *linked;
1215         struct btree_node_iter node_iter;
1216         struct bkey_i *insert = bch2_keylist_front(insert_keys);
1217         struct bkey_packed *k;
1218
1219         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1220         BUG_ON(!b->level);
1221         BUG_ON(!as || as->b);
1222         verify_keys_sorted(insert_keys);
1223
1224         btree_node_lock_for_insert(b, iter);
1225
1226         if (bch_keylist_u64s(insert_keys) >
1227             bch_btree_keys_u64s_remaining(c, b)) {
1228                 bch2_btree_node_unlock_write(b, iter);
1229                 return BTREE_INSERT_BTREE_NODE_FULL;
1230         }
1231
1232         /* Don't screw up @iter's position: */
1233         node_iter = iter->node_iters[b->level];
1234
1235         /*
1236          * btree_split(), btree_gc_coalesce() will insert keys before
1237          * the iterator's current position - they know the keys go in
1238          * the node the iterator points to:
1239          */
1240         while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
1241                (bkey_cmp_packed(b, k, &insert->k) >= 0))
1242                 ;
1243
1244         while (!bch2_keylist_empty(insert_keys)) {
1245                 insert = bch2_keylist_front(insert_keys);
1246
1247                 bch2_insert_fixup_btree_ptr(iter, b, insert,
1248                                            &node_iter, &res->disk_res);
1249                 bch2_keylist_pop_front(insert_keys);
1250         }
1251
1252         btree_interior_update_updated_btree(c, as, b);
1253
1254         for_each_linked_btree_node(iter, b, linked)
1255                 bch2_btree_node_iter_peek(&linked->node_iters[b->level],
1256                                          b);
1257         bch2_btree_node_iter_peek(&iter->node_iters[b->level], b);
1258
1259         bch2_btree_iter_verify(iter, b);
1260
1261         if (bch2_maybe_compact_whiteouts(c, b))
1262                 bch2_btree_iter_reinit_node(iter, b);
1263
1264         bch2_btree_node_unlock_write(b, iter);
1265
1266         btree_node_interior_verify(b);
1267         return BTREE_INSERT_OK;
1268 }
1269
1270 /*
1271  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
1272  * node)
1273  */
1274 static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
1275                                         struct btree_reserve *reserve)
1276 {
1277         size_t nr_packed = 0, nr_unpacked = 0;
1278         struct btree *n2;
1279         struct bset *set1, *set2;
1280         struct bkey_packed *k, *prev = NULL;
1281
1282         n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
1283         n2->data->max_key       = n1->data->max_key;
1284         n2->data->format        = n1->format;
1285         n2->key.k.p = n1->key.k.p;
1286
1287         btree_node_set_format(n2, n2->data->format);
1288
1289         set1 = btree_bset_first(n1);
1290         set2 = btree_bset_first(n2);
1291
1292         /*
1293          * Has to be a linear search because we don't have an auxiliary
1294          * search tree yet
1295          */
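        /*
         * Walk until roughly 3/5 of the keys (by u64s) have been seen; those
         * stay in n1, everything after the split point moves to n2 below.
         */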
1296         k = set1->start;
1297         while (1) {
1298                 if (bkey_next(k) == vstruct_last(set1))
1299                         break;
1300                 if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
1301                         break;
1302
1303                 if (bkey_packed(k))
1304                         nr_packed++;
1305                 else
1306                         nr_unpacked++;
1307
1308                 prev = k;
1309                 k = bkey_next(k);
1310         }
1311
1312         BUG_ON(!prev);
1313
1314         n1->key.k.p = bkey_unpack_pos(n1, prev);
1315         n1->data->max_key = n1->key.k.p;
1316         n2->data->min_key =
1317                 btree_type_successor(n1->btree_id, n1->key.k.p);
1318
1319         set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
1320         set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
1321
1322         set_btree_bset_end(n1, n1->set);
1323         set_btree_bset_end(n2, n2->set);
1324
1325         n2->nr.live_u64s        = le16_to_cpu(set2->u64s);
1326         n2->nr.bset_u64s[0]     = le16_to_cpu(set2->u64s);
1327         n2->nr.packed_keys      = n1->nr.packed_keys - nr_packed;
1328         n2->nr.unpacked_keys    = n1->nr.unpacked_keys - nr_unpacked;
1329
1330         n1->nr.live_u64s        = le16_to_cpu(set1->u64s);
1331         n1->nr.bset_u64s[0]     = le16_to_cpu(set1->u64s);
1332         n1->nr.packed_keys      = nr_packed;
1333         n1->nr.unpacked_keys    = nr_unpacked;
1334
1335         BUG_ON(!set1->u64s);
1336         BUG_ON(!set2->u64s);
1337
1338         memcpy_u64s(set2->start,
1339                     vstruct_end(set1),
1340                     le16_to_cpu(set2->u64s));
1341
1342         btree_node_reset_sib_u64s(n1);
1343         btree_node_reset_sib_u64s(n2);
1344
1345         bch2_verify_btree_nr_keys(n1);
1346         bch2_verify_btree_nr_keys(n2);
1347
1348         if (n1->level) {
1349                 btree_node_interior_verify(n1);
1350                 btree_node_interior_verify(n2);
1351         }
1352
1353         return n2;
1354 }
1355
1356 /*
1357  * For updates to interior nodes, we've got to do the insert before we split
1358  * because the stuff we're inserting has to be inserted atomically. Post split,
1359  * the keys might have to go in different nodes and the split would no longer be
1360  * atomic.
1361  *
1362  * Worse, if the insert is from btree node coalescing, if we do the insert after
1363  * we do the split (and pick the pivot) - the pivot we pick might be between
1364  * nodes that were coalesced, and thus in the middle of a child node post
1365  * coalescing:
1366  */
1367 static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b,
1368                                     struct keylist *keys,
1369                                     struct btree_reserve *res)
1370 {
1371         struct btree_node_iter node_iter;
1372         struct bkey_i *k = bch2_keylist_front(keys);
1373         struct bkey_packed *p;
1374         struct bset *i;
1375
1376         BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
1377
1378         bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
1379
1380         while (!bch2_keylist_empty(keys)) {
1381                 k = bch2_keylist_front(keys);
1382
1383                 BUG_ON(bch_keylist_u64s(keys) >
1384                        bch_btree_keys_u64s_remaining(iter->c, b));
1385                 BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0);
1386                 BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0);
1387
1388                 bch2_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res);
1389                 bch2_keylist_pop_front(keys);
1390         }
1391
1392         /*
1393          * We can't tolerate whiteouts here - with whiteouts there can be
1394          * duplicate keys, and it would be rather bad if we picked a duplicate
1395          * for the pivot:
1396          */
1397         i = btree_bset_first(b);
1398         p = i->start;
1399         while (p != vstruct_last(i))
1400                 if (bkey_deleted(p)) {
1401                         le16_add_cpu(&i->u64s, -p->u64s);
1402                         set_btree_bset_end(b, b->set);
1403                         memmove_u64s_down(p, bkey_next(p),
1404                                           (u64 *) vstruct_last(i) -
1405                                           (u64 *) p);
1406                 } else
1407                         p = bkey_next(p);
1408
1409         BUG_ON(b->nsets != 1 ||
1410                b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
1411
1412         btree_node_interior_verify(b);
1413 }
1414
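/*
 * Split (or just compact) @b: allocate a replacement node, split the
 * replacement in two if it's still more than BTREE_SPLIT_THRESHOLD full
 * (allocating a new root as well if @b was the root), write out the new
 * node(s), insert the new pointers into the parent or set the new root, then
 * free @b and update the iterator to point at the new nodes.
 */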
1415 static void btree_split(struct btree *b, struct btree_iter *iter,
1416                         struct keylist *insert_keys,
1417                         struct btree_reserve *reserve,
1418                         struct btree_interior_update *as)
1419 {
1420         struct bch_fs *c = iter->c;
1421         struct btree *parent = iter->nodes[b->level + 1];
1422         struct btree *n1, *n2 = NULL, *n3 = NULL;
1423         u64 start_time = local_clock();
1424
1425         BUG_ON(!parent && (b != btree_node_root(c, b)));
1426         BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level));
1427
1428         bch2_btree_interior_update_will_free_node(c, as, b);
1429
1430         n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
1431         if (b->level)
1432                 btree_split_insert_keys(iter, n1, insert_keys, reserve);
1433
1434         if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
1435                 trace_btree_node_split(c, b, b->nr.live_u64s);
1436
1437                 n2 = __btree_split_node(iter, n1, reserve);
1438
1439                 bch2_btree_build_aux_trees(n2);
1440                 bch2_btree_build_aux_trees(n1);
1441                 six_unlock_write(&n2->lock);
1442                 six_unlock_write(&n1->lock);
1443
1444                 bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent, -1);
1445
1446                 /*
1447                  * Note that on recursion, parent_keys == insert_keys, so we
1448                  * can't start adding new keys to parent_keys before emptying it
1449                  * out (which we did with btree_split_insert_keys() above)
1450                  */
1451                 bch2_keylist_add(&as->parent_keys, &n1->key);
1452                 bch2_keylist_add(&as->parent_keys, &n2->key);
1453
1454                 if (!parent) {
1455                         /* Depth increases, make a new root */
1456                         n3 = __btree_root_alloc(c, b->level + 1,
1457                                                 iter->btree_id,
1458                                                 reserve);
1459                         n3->sib_u64s[0] = U16_MAX;
1460                         n3->sib_u64s[1] = U16_MAX;
1461
1462                         btree_split_insert_keys(iter, n3, &as->parent_keys,
1463                                                 reserve);
1464                         bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent, -1);
1465                 }
1466         } else {
1467                 trace_btree_node_compact(c, b, b->nr.live_u64s);
1468
1469                 bch2_btree_build_aux_trees(n1);
1470                 six_unlock_write(&n1->lock);
1471
1472                 bch2_keylist_add(&as->parent_keys, &n1->key);
1473         }
1474
1475         bch2_btree_node_write(c, n1, &as->cl, SIX_LOCK_intent, -1);
1476
1477         /* New nodes all written, now make them visible: */
1478
1479         if (parent) {
1480                 /* Split a non-root node */
1481                 bch2_btree_insert_node(parent, iter, &as->parent_keys,
1482                                       reserve, as);
1483         } else if (n3) {
1484                 bch2_btree_set_root(iter, n3, as, reserve);
1485         } else {
1486                 /* Root filled up but didn't need to be split */
1487                 bch2_btree_set_root(iter, n1, as, reserve);
1488         }
1489
1490         bch2_btree_open_bucket_put(c, n1);
1491         if (n2)
1492                 bch2_btree_open_bucket_put(c, n2);
1493         if (n3)
1494                 bch2_btree_open_bucket_put(c, n3);
1495
1496         /*
1497          * Note - at this point other linked iterators could still have @b read
1498          * locked; we're depending on the bch2_btree_iter_node_replace() calls
1499          * below removing all references to @b so we don't return with other
1500          * iterators pointing to a node they have locked that's been freed.
1501          *
1502          * We have to free the node first because the bch2_btree_iter_node_replace()
1503          * calls will drop _our_ iterator's reference - and intent lock - to @b.
1504          */
1505         bch2_btree_node_free_inmem(iter, b);
1506
1507         /* Successful split, update the iterator to point to the new nodes: */
1508
1509         if (n3)
1510                 bch2_btree_iter_node_replace(iter, n3);
1511         if (n2)
1512                 bch2_btree_iter_node_replace(iter, n2);
1513         bch2_btree_iter_node_replace(iter, n1);
1514
1515         bch2_time_stats_update(&c->btree_split_time, start_time);
1516 }
1517
1518 /**
1519  * bch2_btree_insert_node - insert bkeys into a given btree node
1520  *
1521  * @b:                  interior btree node to insert into
1522  * @iter:               btree iterator
1523  * @insert_keys:        list of keys to insert
1524  * @reserve:            btree reserve new nodes are allocated from on split
1525  * @as:                 btree_interior_update this insert belongs to
1526  *
1527  * Inserts the keys into @b, splitting @b if necessary; since inserts into
1528  * interior nodes must be atomic, all of @insert_keys is expected to be consumed.
1529  */
1530 void bch2_btree_insert_node(struct btree *b,
1531                            struct btree_iter *iter,
1532                            struct keylist *insert_keys,
1533                            struct btree_reserve *reserve,
1534                            struct btree_interior_update *as)
1535 {
1536         BUG_ON(!b->level);
1537         BUG_ON(!reserve || !as);
1538
1539         switch (bch2_btree_insert_keys_interior(b, iter, insert_keys,
1540                                                as, reserve)) {
1541         case BTREE_INSERT_OK:
1542                 break;
1543         case BTREE_INSERT_BTREE_NODE_FULL:
1544                 btree_split(b, iter, insert_keys, reserve, as);
1545                 break;
1546         default:
1547                 BUG();
1548         }
1549 }
1550
1551 static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags)
1552 {
1553         struct bch_fs *c = iter->c;
1554         struct btree *b = iter->nodes[0];
1555         struct btree_reserve *reserve;
1556         struct btree_interior_update *as;
1557         struct closure cl;
1558         int ret = 0;
1559
1560         closure_init_stack(&cl);
1561
1562         /* Hack, because gc and splitting nodes don't mix yet: */
1563         if (!down_read_trylock(&c->gc_lock)) {
1564                 bch2_btree_iter_unlock(iter);
1565                 down_read(&c->gc_lock);
1566         }
1567
1568         /*
1569          * XXX: figure out how far we might need to split,
1570          * instead of locking/reserving all the way to the root:
1571          */
1572         if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
1573                 ret = -EINTR;
1574                 goto out;
1575         }
1576
1577         reserve = bch2_btree_reserve_get(c, b, 0, flags, &cl);
1578         if (IS_ERR(reserve)) {
1579                 ret = PTR_ERR(reserve);
1580                 if (ret == -EAGAIN) {
1581                         bch2_btree_iter_unlock(iter);
1582                         up_read(&c->gc_lock);
1583                         closure_sync(&cl);
1584                         return -EINTR;
1585                 }
1586                 goto out;
1587         }
1588
1589         as = bch2_btree_interior_update_alloc(c);
1590
1591         btree_split(b, iter, NULL, reserve, as);
1592         bch2_btree_reserve_put(c, reserve);
1593
1594         bch2_btree_iter_set_locks_want(iter, 1);
1595 out:
1596         up_read(&c->gc_lock);
1597         return ret;
1598 }
1599
1600 enum btree_node_sibling {
1601         btree_prev_sib,
1602         btree_next_sib,
1603 };
1604
1605 static struct btree *btree_node_get_sibling(struct btree_iter *iter,
1606                                             struct btree *b,
1607                                             enum btree_node_sibling sib)
1608 {
1609         struct btree *parent;
1610         struct btree_node_iter node_iter;
1611         struct bkey_packed *k;
1612         BKEY_PADDED(k) tmp;
1613         struct btree *ret;
1614         unsigned level = b->level;
1615
1616         parent = iter->nodes[level + 1];
1617         if (!parent)
1618                 return NULL;
1619
1620         if (!bch2_btree_node_relock(iter, level + 1)) {
1621                 bch2_btree_iter_set_locks_want(iter, level + 2);
1622                 return ERR_PTR(-EINTR);
1623         }
1624
1625         node_iter = iter->node_iters[parent->level];
1626
1627         k = bch2_btree_node_iter_peek_all(&node_iter, parent);
1628         BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
1629
1630         do {
1631                 k = sib == btree_prev_sib
1632                         ? bch2_btree_node_iter_prev_all(&node_iter, parent)
1633                         : (bch2_btree_node_iter_advance(&node_iter, parent),
1634                            bch2_btree_node_iter_peek_all(&node_iter, parent));
1635                 if (!k)
1636                         return NULL;
1637         } while (bkey_deleted(k));
1638
1639         bch2_bkey_unpack(parent, &tmp.k, k);
1640
1641         ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
1642
1643         if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
1644                 btree_node_unlock(iter, level);
1645                 ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent);
1646         }
1647
1648         if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) {
1649                 six_unlock_intent(&ret->lock);
1650                 ret = ERR_PTR(-EINTR);
1651         }
1652
1653         return ret;
1654 }
1655
1656 static int __foreground_maybe_merge(struct btree_iter *iter,
1657                                     enum btree_node_sibling sib)
1658 {
1659         struct bch_fs *c = iter->c;
1660         struct btree_reserve *reserve;
1661         struct btree_interior_update *as;
1662         struct bkey_format_state new_s;
1663         struct bkey_format new_f;
1664         struct bkey_i delete;
1665         struct btree *b, *m, *n, *prev, *next, *parent;
1666         struct closure cl;
1667         size_t sib_u64s;
1668         int ret = 0;
1669
1670         closure_init_stack(&cl);
1671 retry:
1672         if (!bch2_btree_node_relock(iter, iter->level))
1673                 return 0;
1674
1675         b = iter->nodes[iter->level];
1676
1677         parent = iter->nodes[b->level + 1];
1678         if (!parent)
1679                 return 0;
1680
1681         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1682                 return 0;
1683
1684         /* XXX: can't be holding read locks */
1685         m = btree_node_get_sibling(iter, b, sib);
1686         if (IS_ERR(m)) {
1687                 ret = PTR_ERR(m);
1688                 goto out;
1689         }
1690
1691         /* NULL means no sibling: */
1692         if (!m) {
1693                 b->sib_u64s[sib] = U16_MAX;
1694                 return 0;
1695         }
1696
1697         if (sib == btree_prev_sib) {
1698                 prev = m;
1699                 next = b;
1700         } else {
1701                 prev = b;
1702                 next = m;
1703         }
1704
1705         bch2_bkey_format_init(&new_s);
1706         __bch2_btree_calc_format(&new_s, b);
1707         __bch2_btree_calc_format(&new_s, m);
1708         new_f = bch2_bkey_format_done(&new_s);
1709
1710         sib_u64s = btree_node_u64s_with_format(b, &new_f) +
1711                 btree_node_u64s_with_format(m, &new_f);
1712
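        /*
         * Illustrative worked example (not from the original source): with a
         * hysteresis threshold of 1000 u64s and siblings totalling 1400 u64s,
         * we record 1000 + (1400 - 1000) / 2 = 1200, so the cached estimate
         * decays towards the threshold instead of immediately re-arming
         * another merge attempt.
         */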
1713         if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
1714                 sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1715                 sib_u64s /= 2;
1716                 sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
1717         }
1718
1719         sib_u64s = min(sib_u64s, btree_max_u64s(c));
1720         b->sib_u64s[sib] = sib_u64s;
1721
1722         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
1723                 six_unlock_intent(&m->lock);
1724                 return 0;
1725         }
1726
1727         /* We're changing btree topology, which doesn't mix with gc: */
1728         if (!down_read_trylock(&c->gc_lock)) {
1729                 six_unlock_intent(&m->lock);
1730                 bch2_btree_iter_unlock(iter);
1731
1732                 down_read(&c->gc_lock);
1733                 up_read(&c->gc_lock);
1734                 ret = -EINTR;
1735                 goto out;
1736         }
1737
1738         if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
1739                 ret = -EINTR;
1740                 goto out_unlock;
1741         }
1742
1743         reserve = bch2_btree_reserve_get(c, b, 0,
1744                                         BTREE_INSERT_NOFAIL|
1745                                         BTREE_INSERT_USE_RESERVE,
1746                                         &cl);
1747         if (IS_ERR(reserve)) {
1748                 ret = PTR_ERR(reserve);
1749                 goto out_unlock;
1750         }
1751
1752         as = bch2_btree_interior_update_alloc(c);
1753
1754         bch2_btree_interior_update_will_free_node(c, as, b);
1755         bch2_btree_interior_update_will_free_node(c, as, m);
1756
1757         n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
1758         n->data->min_key        = prev->data->min_key;
1759         n->data->max_key        = next->data->max_key;
1760         n->data->format         = new_f;
1761         n->key.k.p              = next->key.k.p;
1762
1763         btree_node_set_format(n, new_f);
1764
1765         bch2_btree_sort_into(c, n, prev);
1766         bch2_btree_sort_into(c, n, next);
1767
1768         bch2_btree_build_aux_trees(n);
1769         six_unlock_write(&n->lock);
1770
1771         bkey_init(&delete.k);
1772         delete.k.p = prev->key.k.p;
1773         bch2_keylist_add(&as->parent_keys, &delete);
1774         bch2_keylist_add(&as->parent_keys, &n->key);
1775
1776         bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
1777
1778         bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as);
1779
1780         bch2_btree_open_bucket_put(c, n);
1781         bch2_btree_node_free_inmem(iter, b);
1782         bch2_btree_node_free_inmem(iter, m);
1783         bch2_btree_iter_node_replace(iter, n);
1784
1785         bch2_btree_iter_verify(iter, n);
1786
1787         bch2_btree_reserve_put(c, reserve);
1788 out_unlock:
1789         if (ret != -EINTR && ret != -EAGAIN)
1790                 bch2_btree_iter_set_locks_want(iter, 1);
1791         six_unlock_intent(&m->lock);
1792         up_read(&c->gc_lock);
1793 out:
1794         if (ret == -EAGAIN || ret == -EINTR) {
1795                 bch2_btree_iter_unlock(iter);
1796                 ret = -EINTR;
1797         }
1798
1799         closure_sync(&cl);
1800
1801         if (ret == -EINTR) {
1802                 ret = bch2_btree_iter_traverse(iter);
1803                 if (!ret)
1804                         goto retry;
1805         }
1806
1807         return ret;
1808 }
1809
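/*
 * Cheap check before committing to a merge: only call into
 * __foreground_maybe_merge() if we still hold the node locked and the cached
 * estimate of the merged size is at or below the merge threshold.
 */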
1810 static inline int foreground_maybe_merge(struct btree_iter *iter,
1811                                          enum btree_node_sibling sib)
1812 {
1813         struct bch_fs *c = iter->c;
1814         struct btree *b;
1815
1816         if (!btree_node_locked(iter, iter->level))
1817                 return 0;
1818
1819         b = iter->nodes[iter->level];
1820         if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
1821                 return 0;
1822
1823         return __foreground_maybe_merge(iter, sib);
1824 }
1825
1826 /**
1827  * btree_insert_key - insert a single key into a leaf node
1828  */
1829 static enum btree_insert_ret
1830 btree_insert_key(struct btree_insert *trans,
1831                  struct btree_insert_entry *insert)
1832 {
1833         struct bch_fs *c = trans->c;
1834         struct btree_iter *iter = insert->iter;
1835         struct btree *b = iter->nodes[0];
1836         enum btree_insert_ret ret;
1837         int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
1838         int old_live_u64s = b->nr.live_u64s;
1839         int live_u64s_added, u64s_added;
1840
1841         ret = !btree_node_is_extents(b)
1842                 ? bch2_insert_fixup_key(trans, insert)
1843                 : bch2_insert_fixup_extent(trans, insert);
1844
1845         live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
1846         u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
1847
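        /*
         * If the insert deleted keys, shrink our cached estimates of how big
         * a node merged with each sibling would be:
         */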
1848         if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
1849                 b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
1850         if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
1851                 b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
1852
1853         if (u64s_added > live_u64s_added &&
1854             bch2_maybe_compact_whiteouts(iter->c, b))
1855                 bch2_btree_iter_reinit_node(iter, b);
1856
1857         trace_btree_insert_key(c, b, insert->k);
1858         return ret;
1859 }
1860
1861 static bool same_leaf_as_prev(struct btree_insert *trans,
1862                               struct btree_insert_entry *i)
1863 {
1864         /*
1865          * Because we sorted the transaction entries, if multiple iterators
1866          * point to the same leaf node they'll always be adjacent now:
1867          */
1868         return i != trans->entries &&
1869                 i[0].iter->nodes[0] == i[-1].iter->nodes[0];
1870 }
1871
1872 #define trans_for_each_entry(trans, i)                                  \
1873         for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
1874
1875 static void multi_lock_write(struct btree_insert *trans)
1876 {
1877         struct btree_insert_entry *i;
1878
1879         trans_for_each_entry(trans, i)
1880                 if (!same_leaf_as_prev(trans, i))
1881                         btree_node_lock_for_insert(i->iter->nodes[0], i->iter);
1882 }
1883
1884 static void multi_unlock_write(struct btree_insert *trans)
1885 {
1886         struct btree_insert_entry *i;
1887
1888         trans_for_each_entry(trans, i)
1889                 if (!same_leaf_as_prev(trans, i))
1890                         bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter);
1891 }
1892
1893 static int btree_trans_entry_cmp(const void *_l, const void *_r)
1894 {
1895         const struct btree_insert_entry *l = _l;
1896         const struct btree_insert_entry *r = _r;
1897
1898         return btree_iter_cmp(l->iter, r->iter);
1899 }
1900
1901 /* Normal update interface: */
1902
1903 /**
1904  * __bch2_btree_insert_at - insert keys at given iterator positions
1905  *
1906  * This is the main entry point for btree updates.
1907  *
1908  * Return values:
1909  * -EINTR: locking changed, this function should be called again. Only returned
1910  *  if passed BTREE_INSERT_ATOMIC.
1911  * -EROFS: filesystem read only
1912  * -EIO: journal or btree node IO error
1913  */
1914 int __bch2_btree_insert_at(struct btree_insert *trans)
1915 {
1916         struct bch_fs *c = trans->c;
1917         struct btree_insert_entry *i;
1918         struct btree_iter *split = NULL;
1919         bool cycle_gc_lock = false;
1920         unsigned u64s;
1921         int ret;
1922
1923         trans_for_each_entry(trans, i) {
1924                 EBUG_ON(i->iter->level);
1925                 EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
1926         }
1927
1928         sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
1929              btree_trans_entry_cmp, NULL);
1930
1931         if (unlikely(!percpu_ref_tryget(&c->writes)))
1932                 return -EROFS;
1933 retry_locks:
1934         ret = -EINTR;
1935         trans_for_each_entry(trans, i)
1936                 if (!bch2_btree_iter_set_locks_want(i->iter, 1))
1937                         goto err;
1938 retry:
1939         trans->did_work = false;
1940         u64s = 0;
1941         trans_for_each_entry(trans, i)
1942                 if (!i->done)
1943                         u64s += jset_u64s(i->k->k.u64s + i->extra_res);
1944
1945         memset(&trans->journal_res, 0, sizeof(trans->journal_res));
1946
1947         ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
1948                 ? bch2_journal_res_get(&c->journal,
1949                                       &trans->journal_res,
1950                                       u64s, u64s)
1951                 : 0;
1952         if (ret)
1953                 goto err;
1954
1955         multi_lock_write(trans);
1956
1957         u64s = 0;
1958         trans_for_each_entry(trans, i) {
1959                 /* Multiple inserts might go to the same leaf: */
1960                 if (!same_leaf_as_prev(trans, i))
1961                         u64s = 0;
1962
1963                 /*
1964                  * bch2_btree_node_insert_fits() must be called under write lock:
1965                  * with only an intent lock, another thread can still call
1966                  * bch2_btree_node_write(), converting an unwritten bset to a
1967                  * written one
1968                  */
1969                 if (!i->done) {
1970                         u64s += i->k->k.u64s + i->extra_res;
1971                         if (!bch2_btree_node_insert_fits(c,
1972                                         i->iter->nodes[0], u64s)) {
1973                                 split = i->iter;
1974                                 goto unlock;
1975                         }
1976                 }
1977         }
1978
1979         ret = 0;
1980         split = NULL;
1981         cycle_gc_lock = false;
1982
1983         trans_for_each_entry(trans, i) {
1984                 if (i->done)
1985                         continue;
1986
1987                 switch (btree_insert_key(trans, i)) {
1988                 case BTREE_INSERT_OK:
1989                         i->done = true;
1990                         break;
1991                 case BTREE_INSERT_JOURNAL_RES_FULL:
1992                 case BTREE_INSERT_NEED_TRAVERSE:
1993                         ret = -EINTR;
1994                         break;
1995                 case BTREE_INSERT_NEED_RESCHED:
1996                         ret = -EAGAIN;
1997                         break;
1998                 case BTREE_INSERT_BTREE_NODE_FULL:
1999                         split = i->iter;
2000                         break;
2001                 case BTREE_INSERT_ENOSPC:
2002                         ret = -ENOSPC;
2003                         break;
2004                 case BTREE_INSERT_NEED_GC_LOCK:
2005                         cycle_gc_lock = true;
2006                         ret = -EINTR;
2007                         break;
2008                 default:
2009                         BUG();
2010                 }
2011
2012                 if (!trans->did_work && (ret || split))
2013                         break;
2014         }
2015 unlock:
2016         multi_unlock_write(trans);
2017         bch2_journal_res_put(&c->journal, &trans->journal_res);
2018
2019         if (split)
2020                 goto split;
2021         if (ret)
2022                 goto err;
2023
2024         /*
2025          * hack: iterators are inconsistent when they hit end of leaf, until
2026          * traversed again
2027          */
2028         trans_for_each_entry(trans, i)
2029                 if (i->iter->at_end_of_leaf)
2030                         goto out;
2031
2032         trans_for_each_entry(trans, i)
2033                 if (!same_leaf_as_prev(trans, i)) {
2034                         foreground_maybe_merge(i->iter, btree_prev_sib);
2035                         foreground_maybe_merge(i->iter, btree_next_sib);
2036                 }
2037 out:
2038         /* make sure we didn't lose an error: */
2039         if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
2040                 trans_for_each_entry(trans, i)
2041                         BUG_ON(!i->done);
2042
2043         percpu_ref_put(&c->writes);
2044         return ret;
2045 split:
2046         /*
2047          * have to drop journal res before splitting, because splitting means
2048          * allocating new btree nodes, and holding a journal reservation
2049          * potentially blocks the allocator:
2050          */
2051         ret = bch2_btree_split_leaf(split, trans->flags);
2052         if (ret)
2053                 goto err;
2054         /*
2055          * if the split didn't have to drop locks the insert will still be
2056          * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
2057          * and is overwriting won't have changed)
2058          */
2059         goto retry_locks;
2060 err:
2061         if (cycle_gc_lock) {
2062                 down_read(&c->gc_lock);
2063                 up_read(&c->gc_lock);
2064         }
2065
2066         if (ret == -EINTR) {
2067                 trans_for_each_entry(trans, i) {
2068                         int ret2 = bch2_btree_iter_traverse(i->iter);
2069                         if (ret2) {
2070                                 ret = ret2;
2071                                 goto out;
2072                         }
2073                 }
2074
2075                 /*
2076                  * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
2077                  * dropped locks:
2078                  */
2079                 if (!(trans->flags & BTREE_INSERT_ATOMIC))
2080                         goto retry;
2081         }
2082
2083         goto out;
2084 }
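
/*
 * Hedged usage sketch (illustrative, not part of the original source): a
 * caller passing BTREE_INSERT_ATOMIC typically re-traverses and retries on
 * -EINTR, roughly:
 *
 *	do {
 *		ret = bch2_btree_iter_traverse(&iter) ?:
 *			bch2_btree_insert_at(c, NULL, NULL, journal_seq,
 *					     BTREE_INSERT_ATOMIC,
 *					     BTREE_INSERT_ENTRY(&iter, k));
 *	} while (ret == -EINTR);
 */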
2085
2086 int bch2_btree_insert_list_at(struct btree_iter *iter,
2087                              struct keylist *keys,
2088                              struct disk_reservation *disk_res,
2089                              struct extent_insert_hook *hook,
2090                              u64 *journal_seq, unsigned flags)
2091 {
2092         BUG_ON(flags & BTREE_INSERT_ATOMIC);
2093         BUG_ON(bch2_keylist_empty(keys));
2094         verify_keys_sorted(keys);
2095
2096         while (!bch2_keylist_empty(keys)) {
2097                 /* need to traverse between each insert */
2098                 int ret = bch2_btree_iter_traverse(iter);
2099                 if (ret)
2100                         return ret;
2101
2102                 ret = bch2_btree_insert_at(iter->c, disk_res, hook,
2103                                 journal_seq, flags,
2104                                 BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
2105                 if (ret)
2106                         return ret;
2107
2108                 bch2_keylist_pop_front(keys);
2109         }
2110
2111         return 0;
2112 }
2113
2114 /**
2115  * bch2_btree_insert_check_key - insert a dummy key into the btree
2116  *
2117  * We insert a random key on a cache miss, then compare-exchange on it
2118  * once the cache promotion or backing device read completes. This
2119  * ensures that if this key is written to after the read, the read will
2120  * lose and not overwrite the key with stale data.
2121  *
2122  * Return values:
2123  * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
2124  * -EINTR: btree node was changed while upgrading to write lock
2125  */
2126 int bch2_btree_insert_check_key(struct btree_iter *iter,
2127                                struct bkey_i *check_key)
2128 {
2129         struct bpos saved_pos = iter->pos;
2130         struct bkey_i_cookie *cookie;
2131         BKEY_PADDED(key) tmp;
2132         int ret;
2133
2134         BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
2135
2136         check_key->k.type = KEY_TYPE_COOKIE;
2137         set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
2138
2139         cookie = bkey_i_to_cookie(check_key);
2140         get_random_bytes(&cookie->v, sizeof(cookie->v));
2141
2142         bkey_copy(&tmp.key, check_key);
2143
2144         ret = bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
2145                                   BTREE_INSERT_ATOMIC,
2146                                   BTREE_INSERT_ENTRY(iter, &tmp.key));
2147
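        /* The insert may have advanced the iterator; restore the caller's pos: */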
2148         bch2_btree_iter_rewind(iter, saved_pos);
2149
2150         return ret;
2151 }
2152
2153 /**
2154  * bch2_btree_insert - insert a single key into the given btree
2155  * @c:                  pointer to struct bch_fs
2156  * @id:                 btree to insert into
2157  * @k:                  key to insert
2158  * @hook:               insert callback
2159  */
2160 int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
2161                      struct bkey_i *k,
2162                      struct disk_reservation *disk_res,
2163                      struct extent_insert_hook *hook,
2164                      u64 *journal_seq, int flags)
2165 {
2166         struct btree_iter iter;
2167         int ret, ret2;
2168
2169         bch2_btree_iter_init_intent(&iter, c, id, bkey_start_pos(&k->k));
2170
2171         ret = bch2_btree_iter_traverse(&iter);
2172         if (unlikely(ret))
2173                 goto out;
2174
2175         ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
2176                                   BTREE_INSERT_ENTRY(&iter, k));
2177 out:    ret2 = bch2_btree_iter_unlock(&iter);
2178
2179         return ret ?: ret2;
2180 }
2181
2182 /**
2183  * bch2_btree_update - like bch2_btree_insert(), but requires the key being
2184  * overwritten to already exist (returns -ENOENT otherwise)
2185  */
2186 int bch2_btree_update(struct bch_fs *c, enum btree_id id,
2187                      struct bkey_i *k, u64 *journal_seq)
2188 {
2189         struct btree_iter iter;
2190         struct bkey_s_c u;
2191         int ret;
2192
2193         EBUG_ON(id == BTREE_ID_EXTENTS);
2194
2195         bch2_btree_iter_init_intent(&iter, c, id, k->k.p);
2196
2197         u = bch2_btree_iter_peek_with_holes(&iter);
2198         ret = btree_iter_err(u);
2199         if (ret)
2200                 return ret;
2201
2202         if (bkey_deleted(u.k)) {
2203                 bch2_btree_iter_unlock(&iter);
2204                 return -ENOENT;
2205         }
2206
2207         ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0,
2208                                   BTREE_INSERT_ENTRY(&iter, k));
2209         bch2_btree_iter_unlock(&iter);
2210         return ret;
2211 }
2212
2213 /*
2214  * bch2_btree_delete_range - delete everything within a given range
2215  *
2216  * Range is a half-open interval - [start, end)
2217  */
2218 int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
2219                            struct bpos start,
2220                            struct bpos end,
2221                            struct bversion version,
2222                            struct disk_reservation *disk_res,
2223                            struct extent_insert_hook *hook,
2224                            u64 *journal_seq)
2225 {
2226         struct btree_iter iter;
2227         struct bkey_s_c k;
2228         int ret = 0;
2229
2230         bch2_btree_iter_init_intent(&iter, c, id, start);
2231
2232         while ((k = bch2_btree_iter_peek(&iter)).k &&
2233                !(ret = btree_iter_err(k))) {
2234                 unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
2235                 /* really shouldn't be using a bare, unpadded bkey_i */
2236                 struct bkey_i delete;
2237
2238                 if (bkey_cmp(iter.pos, end) >= 0)
2239                         break;
2240
2241                 bkey_init(&delete.k);
2242
2243                 /*
2244                  * For extents, iter.pos won't necessarily be the same as
2245                  * bkey_start_pos(k.k) (for non extents they always will be the
2246                  * same). It's important that we delete starting from iter.pos
2247                  * because the range we want to delete could start in the middle
2248                  * of k.
2249                  *
2250                  * (bch2_btree_iter_peek() does guarantee that iter.pos >=
2251                  * bkey_start_pos(k.k)).
2252                  */
2253                 delete.k.p = iter.pos;
2254                 delete.k.version = version;
2255
2256                 if (iter.is_extents) {
2257                         /*
2258                          * The extents btree is special - KEY_TYPE_DISCARD is
2259                          * used for deletions, not KEY_TYPE_DELETED. This is an
2260                          * internal implementation detail that probably
2261                          * shouldn't be exposed (internally, KEY_TYPE_DELETED is
2262                          * used as a proxy for k->size == 0):
2263                          */
2264                         delete.k.type = KEY_TYPE_DISCARD;
2265
2266                         /* create the biggest key we can */
2267                         bch2_key_resize(&delete.k, max_sectors);
2268                         bch2_cut_back(end, &delete.k);
2269                 }
2270
2271                 ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
2272                                           BTREE_INSERT_NOFAIL,
2273                                           BTREE_INSERT_ENTRY(&iter, &delete));
2274                 if (ret)
2275                         break;
2276
2277                 bch2_btree_iter_cond_resched(&iter);
2278         }
2279
2280         bch2_btree_iter_unlock(&iter);
2281         return ret;
2282 }
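
/*
 * Hedged usage sketch (illustrative, not part of the original source),
 * assuming the POS() and ZERO_VERSION helpers: deleting all extents of a
 * given inode would look roughly like
 *
 *	bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
 *				POS(inum, 0), POS(inum + 1, 0),
 *				ZERO_VERSION, NULL, NULL, NULL);
 */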
2283
2284 /**
2285  * bch2_btree_node_rewrite - Rewrite/move a btree node
2286  *
2287  * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. allocating
2288  * the btree reserve had to wait)
2289  */
2290 int bch2_btree_node_rewrite(struct btree_iter *iter, struct btree *b,
2291                            struct closure *cl)
2292 {
2293         struct bch_fs *c = iter->c;
2294         struct btree *n, *parent = iter->nodes[b->level + 1];
2295         struct btree_reserve *reserve;
2296         struct btree_interior_update *as;
2297         unsigned flags = BTREE_INSERT_NOFAIL;
2298
2299         /*
2300          * If the caller is going to wait when allocating the reserve fails,
2301          * then this is a rewrite that must succeed:
2302          */
2303         if (cl)
2304                 flags |= BTREE_INSERT_USE_RESERVE;
2305
2306         if (!bch2_btree_iter_set_locks_want(iter, U8_MAX))
2307                 return -EINTR;
2308
2309         reserve = bch2_btree_reserve_get(c, b, 0, flags, cl);
2310         if (IS_ERR(reserve)) {
2311                 trace_btree_gc_rewrite_node_fail(c, b);
2312                 return PTR_ERR(reserve);
2313         }
2314
2315         as = bch2_btree_interior_update_alloc(c);
2316
2317         bch2_btree_interior_update_will_free_node(c, as, b);
2318
2319         n = bch2_btree_node_alloc_replacement(c, b, reserve);
2320
2321         bch2_btree_build_aux_trees(n);
2322         six_unlock_write(&n->lock);
2323
2324         trace_btree_gc_rewrite_node(c, b);
2325
2326         bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent, -1);
2327
2328         if (parent) {
2329                 bch2_btree_insert_node(parent, iter,
2330                                       &keylist_single(&n->key),
2331                                       reserve, as);
2332         } else {
2333                 bch2_btree_set_root(iter, n, as, reserve);
2334         }
2335
2336         bch2_btree_open_bucket_put(c, n);
2337
2338         bch2_btree_node_free_inmem(iter, b);
2339
2340         BUG_ON(!bch2_btree_iter_node_replace(iter, n));
2341
2342         bch2_btree_reserve_put(c, reserve);
2343         return 0;
2344 }