1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_iter.h"
7 #include "btree_locking.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "buckets.h"
11 #include "checksum.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "io.h"
16 #include "journal_reclaim.h"
17 #include "journal_seq_blacklist.h"
18 #include "super-io.h"
19
20 #include <trace/events/bcachefs.h>
21
22 /* btree_node_iter_large: */
23
24 #define btree_node_iter_cmp_heap(h, _l, _r)                             \
25         __btree_node_iter_cmp(b,                                        \
26                                __btree_node_offset_to_key(b, (_l).k),   \
27                                __btree_node_offset_to_key(b, (_r).k))
28
29 void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
30                                      struct btree *b,
31                                      const struct bkey_packed *k,
32                                      const struct bkey_packed *end)
33 {
34         if (k != end) {
35                 struct btree_node_iter_set n =
36                         ((struct btree_node_iter_set) {
37                                  __btree_node_key_to_offset(b, k),
38                                  __btree_node_key_to_offset(b, end)
39                          });
40
41                 __heap_add(iter, n, btree_node_iter_cmp_heap);
42         }
43 }
44
45 void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
46                                         struct btree *b)
47 {
48         iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
49
50         EBUG_ON(!iter->used);
51         EBUG_ON(iter->data->k > iter->data->end);
52
53         if (iter->data->k == iter->data->end)
54                 heap_del(iter, 0, btree_node_iter_cmp_heap);
55         else
56                 heap_sift_down(iter, 0, btree_node_iter_cmp_heap);
57 }
58
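/*
 * Debug-only sanity check: keys in [start, end) must be in order: for extent
 * btrees a key may end exactly where the next one starts, for other btrees
 * positions must be strictly increasing.
 */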
59 static void verify_no_dups(struct btree *b,
60                            struct bkey_packed *start,
61                            struct bkey_packed *end)
62 {
63 #ifdef CONFIG_BCACHEFS_DEBUG
64         struct bkey_packed *k;
65
66         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
67                 struct bkey l = bkey_unpack_key(b, k);
68                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
69
70                 BUG_ON(btree_node_is_extents(b)
71                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
72                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
73                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
74         }
75 #endif
76 }
77
78 static void clear_needs_whiteout(struct bset *i)
79 {
80         struct bkey_packed *k;
81
82         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
83                 k->needs_whiteout = false;
84 }
85
86 static void set_needs_whiteout(struct bset *i)
87 {
88         struct bkey_packed *k;
89
90         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
91                 k->needs_whiteout = true;
92 }
93
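/*
 * Bounce buffers are temporary scratch space for sorting/compacting a node:
 * try the page allocator without blocking first, then fall back to a mempool
 * (whose elements are big enough for a whole btree node) so we can always
 * make forward progress.
 */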
94 static void btree_bounce_free(struct bch_fs *c, unsigned order,
95                               bool used_mempool, void *p)
96 {
97         if (used_mempool)
98                 mempool_free(p, &c->btree_bounce_pool);
99         else
100                 vpfree(p, PAGE_SIZE << order);
101 }
102
103 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
104                                 bool *used_mempool)
105 {
106         void *p;
107
108         BUG_ON(order > btree_page_order(c));
109
110         *used_mempool = false;
111         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
112         if (p)
113                 return p;
114
115         *used_mempool = true;
116         return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
117 }
118
119 typedef int (*sort_cmp_fn)(struct btree *,
120                            struct bkey_packed *,
121                            struct bkey_packed *);
122
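/*
 * sort_iter merges up to MAX_BSETS + 1 already-sorted runs of packed keys.
 * The number of runs is tiny, so the run heads are kept in order with an
 * insertion-sort style sift (__sort_iter_sift()) instead of a heap.
 *
 * Minimal usage sketch (this is what btree_node_sort() and the sort_*()
 * helpers below do):
 *
 *      struct sort_iter iter;
 *      struct bset_tree *t;
 *      struct bkey_packed *k;
 *
 *      sort_iter_init(&iter, b);
 *      for_each_bset(b, t)
 *              sort_iter_add(&iter, btree_bkey_first(b, t), btree_bkey_last(b, t));
 *      sort_iter_sort(&iter, sort_keys_cmp);
 *      while ((k = sort_iter_next(&iter, sort_keys_cmp))) {
 *              // keys are returned in sort_keys_cmp order
 *      }
 */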
123 struct sort_iter {
124         struct btree    *b;
125         unsigned                used;
126
127         struct sort_iter_set {
128                 struct bkey_packed *k, *end;
129         } data[MAX_BSETS + 1];
130 };
131
132 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
133 {
134         memset(iter, 0, sizeof(*iter));
135         iter->b = b;
136 }
137
138 static inline void __sort_iter_sift(struct sort_iter *iter,
139                                     unsigned from,
140                                     sort_cmp_fn cmp)
141 {
142         unsigned i;
143
144         for (i = from;
145              i + 1 < iter->used &&
146              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
147              i++)
148                 swap(iter->data[i], iter->data[i + 1]);
149 }
150
151 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
152 {
153
154         __sort_iter_sift(iter, 0, cmp);
155 }
156
157 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
158 {
159         unsigned i = iter->used;
160
161         while (i--)
162                 __sort_iter_sift(iter, i, cmp);
163 }
164
165 static void sort_iter_add(struct sort_iter *iter,
166                           struct bkey_packed *k,
167                           struct bkey_packed *end)
168 {
169         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
170
171         if (k != end)
172                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
173 }
174
175 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
176 {
177         return iter->used ? iter->data->k : NULL;
178 }
179
180 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
181 {
182         iter->data->k = bkey_next(iter->data->k);
183
184         BUG_ON(iter->data->k > iter->data->end);
185
186         if (iter->data->k == iter->data->end)
187                 array_remove_item(iter->data, iter->used, 0);
188         else
189                 sort_iter_sift(iter, cmp);
190 }
191
192 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
193                                                  sort_cmp_fn cmp)
194 {
195         struct bkey_packed *ret = sort_iter_peek(iter);
196
197         if (ret)
198                 sort_iter_advance(iter, cmp);
199
200         return ret;
201 }
202
203 static inline int sort_key_whiteouts_cmp(struct btree *b,
204                                          struct bkey_packed *l,
205                                          struct bkey_packed *r)
206 {
207         return bkey_cmp_packed(b, l, r);
208 }
209
210 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
211                                    struct sort_iter *iter)
212 {
213         struct bkey_packed *in, *out = dst;
214
215         sort_iter_sort(iter, sort_key_whiteouts_cmp);
216
217         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
218                 bkey_copy(out, in);
219                 out = bkey_next(out);
220         }
221
222         return (u64 *) out - (u64 *) dst;
223 }
224
225 static inline int sort_extent_whiteouts_cmp(struct btree *b,
226                                             struct bkey_packed *l,
227                                             struct bkey_packed *r)
228 {
229         struct bkey ul = bkey_unpack_key(b, l);
230         struct bkey ur = bkey_unpack_key(b, r);
231
232         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
233 }
234
235 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
236                                       struct sort_iter *iter)
237 {
238         const struct bkey_format *f = &iter->b->format;
239         struct bkey_packed *in, *out = dst;
240         struct bkey_i l, r;
241         bool prev = false, l_packed = false;
242         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
243         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
244         u64 new_size;
245
246         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
247
248         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
249
250         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
251                 if (bkey_deleted(in))
252                         continue;
253
254                 EBUG_ON(bkeyp_val_u64s(f, in));
255                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
256
257                 r.k = bkey_unpack_key(iter->b, in);
258
259                 if (prev &&
260                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
261                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
262                                 continue;
263
264                         new_size = l_packed
265                                 ? min(max_packed_size, max_packed_offset -
266                                       bkey_start_offset(&l.k))
267                                 : KEY_SIZE_MAX;
268
269                         new_size = min(new_size, r.k.p.offset -
270                                        bkey_start_offset(&l.k));
271
272                         BUG_ON(new_size < l.k.size);
273
274                         bch2_key_resize(&l.k, new_size);
275
276                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
277                                 continue;
278
279                         bch2_cut_front(l.k.p, &r);
280                 }
281
282                 if (prev) {
283                         if (!bch2_bkey_pack(out, &l, f)) {
284                                 BUG_ON(l_packed);
285                                 bkey_copy(out, &l);
286                         }
287                         out = bkey_next(out);
288                 }
289
290                 l = r;
291                 prev = true;
292                 l_packed = bkey_packed(in);
293         }
294
295         if (prev) {
296                 if (!bch2_bkey_pack(out, &l, f)) {
297                         BUG_ON(l_packed);
298                         bkey_copy(out, &l);
299                 }
300                 out = bkey_next(out);
301         }
302
303         return (u64 *) out - (u64 *) dst;
304 }
305
306 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
307                                     bool compacting,
308                                     enum compact_mode mode)
309 {
310         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
311         unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
312
313         if (mode == COMPACT_LAZY) {
314                 if (should_compact_bset_lazy(b, t) ||
315                     (compacting && !bset_written(b, bset(b, t))))
316                         return dead_u64s;
317         } else {
318                 if (bset_written(b, bset(b, t)))
319                         return dead_u64s;
320         }
321
322         return 0;
323 }
324
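/*
 * Compact whiteouts: deleted extent keys and whiteouts that are no longer
 * needed are dropped from the bsets being compacted, and whiteouts that still
 * need to be written out are collected and sorted into the unwritten whiteout
 * area tracked by b->whiteout_u64s. In COMPACT_WRITTEN_NO_WRITE_LOCK mode
 * (called with only a read lock held) the bset contents are left as they are
 * and only the whiteouts are gathered.
 */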
325 bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
326                              enum compact_mode mode)
327 {
328         const struct bkey_format *f = &b->format;
329         struct bset_tree *t;
330         struct bkey_packed *whiteouts = NULL;
331         struct bkey_packed *u_start, *u_pos;
332         struct sort_iter sort_iter;
333         unsigned order, whiteout_u64s = 0, u64s;
334         bool used_mempool, compacting = false;
335
336         for_each_bset(b, t)
337                 whiteout_u64s += should_compact_bset(b, t,
338                                         whiteout_u64s != 0, mode);
339
340         if (!whiteout_u64s)
341                 return false;
342
343         sort_iter_init(&sort_iter, b);
344
345         whiteout_u64s += b->whiteout_u64s;
346         order = get_order(whiteout_u64s * sizeof(u64));
347
348         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
349         u_start = u_pos = whiteouts;
350
351         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
352                     b->whiteout_u64s);
353         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
354
355         sort_iter_add(&sort_iter, u_start, u_pos);
356
357         for_each_bset(b, t) {
358                 struct bset *i = bset(b, t);
359                 struct bkey_packed *k, *n, *out, *start, *end;
360                 struct btree_node_entry *src = NULL, *dst = NULL;
361
362                 if (t != b->set && !bset_written(b, i)) {
363                         src = container_of(i, struct btree_node_entry, keys);
364                         dst = max(write_block(b),
365                                   (void *) btree_bkey_last(b, t - 1));
366                 }
367
368                 if (!should_compact_bset(b, t, compacting, mode)) {
369                         if (src != dst) {
370                                 memmove(dst, src, sizeof(*src) +
371                                         le16_to_cpu(src->keys.u64s) *
372                                         sizeof(u64));
373                                 i = &dst->keys;
374                                 set_btree_bset(b, t, i);
375                         }
376                         continue;
377                 }
378
379                 compacting = true;
380                 u_start = u_pos;
381                 start = i->start;
382                 end = vstruct_last(i);
383
384                 if (src != dst) {
385                         memmove(dst, src, sizeof(*src));
386                         i = &dst->keys;
387                         set_btree_bset(b, t, i);
388                 }
389
390                 out = i->start;
391
392                 for (k = start; k != end; k = n) {
393                         n = bkey_next(k);
394
395                         if (bkey_deleted(k) && btree_node_is_extents(b))
396                                 continue;
397
398                         if (bkey_whiteout(k) && !k->needs_whiteout)
399                                 continue;
400
401                         if (bkey_whiteout(k)) {
402                                 unreserve_whiteout(b, k);
403                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
404                                 set_bkeyp_val_u64s(f, u_pos, 0);
405                                 u_pos = bkey_next(u_pos);
406                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
407                                 bkey_copy(out, k);
408                                 out = bkey_next(out);
409                         }
410                 }
411
412                 sort_iter_add(&sort_iter, u_start, u_pos);
413
414                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
415                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
416                         set_btree_bset_end(b, t);
417                         bch2_bset_set_no_aux_tree(b, t);
418                 }
419         }
420
421         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
422
423         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
424                (void *) btree_bkey_last(b, bset_tree_last(b)));
425
426         u64s = btree_node_is_extents(b)
427                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
428                                         &sort_iter)
429                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
430                                      &sort_iter);
431
432         BUG_ON(u64s > b->whiteout_u64s);
433         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
434         BUG_ON(u_pos != whiteouts && !u64s);
435
436         if (u64s != b->whiteout_u64s) {
437                 void *src = unwritten_whiteouts_start(c, b);
438
439                 b->whiteout_u64s = u64s;
440                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
441         }
442
443         verify_no_dups(b,
444                        unwritten_whiteouts_start(c, b),
445                        unwritten_whiteouts_end(c, b));
446
447         btree_bounce_free(c, order, used_mempool, whiteouts);
448
449         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
450                 bch2_btree_build_aux_trees(b);
451
452         bch_btree_keys_u64s_remaining(c, b);
453         bch2_verify_btree_nr_keys(b);
454
455         return true;
456 }
457
458 static bool bch2_drop_whiteouts(struct btree *b)
459 {
460         struct bset_tree *t;
461         bool ret = false;
462
463         for_each_bset(b, t) {
464                 struct bset *i = bset(b, t);
465                 struct bkey_packed *k, *n, *out, *start, *end;
466
467                 if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
468                         continue;
469
470                 start   = btree_bkey_first(b, t);
471                 end     = btree_bkey_last(b, t);
472
473                 if (!bset_written(b, i) &&
474                     t != b->set) {
475                         struct bset *dst =
476                                max_t(struct bset *, write_block(b),
477                                      (void *) btree_bkey_last(b, t - 1));
478
479                         memmove(dst, i, sizeof(struct bset));
480                         i = dst;
481                         set_btree_bset(b, t, i);
482                 }
483
484                 out = i->start;
485
486                 for (k = start; k != end; k = n) {
487                         n = bkey_next(k);
488
489                         if (!bkey_whiteout(k)) {
490                                 bkey_copy(out, k);
491                                 out = bkey_next(out);
492                         }
493                 }
494
495                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
496                 bch2_bset_set_no_aux_tree(b, t);
497                 ret = true;
498         }
499
500         bch2_verify_btree_nr_keys(b);
501
502         return ret;
503 }
504
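/*
 * Tie-break for equal positions: whiteouts sort before live keys, and among
 * whiteouts the one without needs_whiteout sorts first, so that sort_keys()
 * below can drop a whiteout that is immediately followed by another key at the
 * same position, passing its needs_whiteout flag along.
 */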
505 static inline int sort_keys_cmp(struct btree *b,
506                                 struct bkey_packed *l,
507                                 struct bkey_packed *r)
508 {
509         return bkey_cmp_packed(b, l, r) ?:
510                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
511                 (int) l->needs_whiteout - (int) r->needs_whiteout;
512 }
513
514 static unsigned sort_keys(struct bkey_packed *dst,
515                           struct sort_iter *iter,
516                           bool filter_whiteouts)
517 {
518         const struct bkey_format *f = &iter->b->format;
519         struct bkey_packed *in, *next, *out = dst;
520
521         sort_iter_sort(iter, sort_keys_cmp);
522
523         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
524                 if (bkey_whiteout(in) &&
525                     (filter_whiteouts || !in->needs_whiteout))
526                         continue;
527
528                 if (bkey_whiteout(in) &&
529                     (next = sort_iter_peek(iter)) &&
530                     !bkey_cmp_packed(iter->b, in, next)) {
531                         BUG_ON(in->needs_whiteout &&
532                                next->needs_whiteout);
533                         /*
534                          * XXX racy, called with read lock from write path
535                          *
536                          * leads to spurious BUG_ON() in bkey_unpack_key() in
537                          * debug mode
538                          */
539                         next->needs_whiteout |= in->needs_whiteout;
540                         continue;
541                 }
542
543                 if (bkey_whiteout(in)) {
544                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
545                         set_bkeyp_val_u64s(f, out, 0);
546                 } else {
547                         bkey_copy(out, in);
548                 }
549                 out = bkey_next(out);
550         }
551
552         return (u64 *) out - (u64 *) dst;
553 }
554
555 static inline int sort_extents_cmp(struct btree *b,
556                                    struct bkey_packed *l,
557                                    struct bkey_packed *r)
558 {
559         return bkey_cmp_packed(b, l, r) ?:
560                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
561 }
562
563 static unsigned sort_extents(struct bkey_packed *dst,
564                              struct sort_iter *iter,
565                              bool filter_whiteouts)
566 {
567         struct bkey_packed *in, *out = dst;
568
569         sort_iter_sort(iter, sort_extents_cmp);
570
571         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
572                 if (bkey_deleted(in))
573                         continue;
574
575                 if (bkey_whiteout(in) &&
576                     (filter_whiteouts || !in->needs_whiteout))
577                         continue;
578
579                 bkey_copy(out, in);
580                 out = bkey_next(out);
581         }
582
583         return (u64 *) out - (u64 *) dst;
584 }
585
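/*
 * Sort bsets [start_idx, end_idx) into one: the keys are merged into a bounce
 * buffer, then either swapped with the node's buffer (when the whole node is
 * being sorted) or copied back over the first bset in the range; the emptied
 * bsets are removed by shifting b->set[] down.
 */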
586 static void btree_node_sort(struct bch_fs *c, struct btree *b,
587                             struct btree_iter *iter,
588                             unsigned start_idx,
589                             unsigned end_idx,
590                             bool filter_whiteouts)
591 {
592         struct btree_node *out;
593         struct sort_iter sort_iter;
594         struct bset_tree *t;
595         struct bset *start_bset = bset(b, &b->set[start_idx]);
596         bool used_mempool = false;
597         u64 start_time, seq = 0;
598         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
599         bool sorting_entire_node = start_idx == 0 &&
600                 end_idx == b->nsets;
601
602         sort_iter_init(&sort_iter, b);
603
604         for (t = b->set + start_idx;
605              t < b->set + end_idx;
606              t++) {
607                 u64s += le16_to_cpu(bset(b, t)->u64s);
608                 sort_iter_add(&sort_iter,
609                               btree_bkey_first(b, t),
610                               btree_bkey_last(b, t));
611         }
612
613         order = sorting_entire_node
614                 ? btree_page_order(c)
615                 : get_order(__vstruct_bytes(struct btree_node, u64s));
616
617         out = btree_bounce_alloc(c, order, &used_mempool);
618
619         start_time = local_clock();
620
621         if (btree_node_is_extents(b))
622                 filter_whiteouts = bset_written(b, start_bset);
623
624         u64s = btree_node_is_extents(b)
625                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
626                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
627
628         out->keys.u64s = cpu_to_le16(u64s);
629
630         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
631
632         if (sorting_entire_node)
633                 bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
634                                        start_time);
635
636         /* Make sure we preserve bset journal_seq: */
637         for (t = b->set + start_idx; t < b->set + end_idx; t++)
638                 seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
639         start_bset->journal_seq = cpu_to_le64(seq);
640
641         if (sorting_entire_node) {
642                 unsigned u64s = le16_to_cpu(out->keys.u64s);
643
644                 BUG_ON(order != btree_page_order(c));
645
646                 /*
647                  * Our temporary buffer is the same size as the btree node's
648                  * buffer, we can just swap buffers instead of doing a big
649                  * memcpy()
650                  */
651                 *out = *b->data;
652                 out->keys.u64s = cpu_to_le16(u64s);
653                 swap(out, b->data);
654                 set_btree_bset(b, b->set, &b->data->keys);
655         } else {
656                 start_bset->u64s = out->keys.u64s;
657                 memcpy_u64s(start_bset->start,
658                             out->keys.start,
659                             le16_to_cpu(out->keys.u64s));
660         }
661
662         for (i = start_idx + 1; i < end_idx; i++)
663                 b->nr.bset_u64s[start_idx] +=
664                         b->nr.bset_u64s[i];
665
666         b->nsets -= shift;
667
668         for (i = start_idx + 1; i < b->nsets; i++) {
669                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
670                 b->set[i]               = b->set[i + shift];
671         }
672
673         for (i = b->nsets; i < MAX_BSETS; i++)
674                 b->nr.bset_u64s[i] = 0;
675
676         set_btree_bset_end(b, &b->set[start_idx]);
677         bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
678
679         btree_bounce_free(c, order, used_mempool, out);
680
681         bch2_verify_btree_nr_keys(b);
682 }
683
684 /* Sort + repack in a new format: */
685 static struct btree_nr_keys sort_repack(struct bset *dst,
686                                         struct btree *src,
687                                         struct btree_node_iter *src_iter,
688                                         struct bkey_format *out_f,
689                                         bool filter_whiteouts)
690 {
691         struct bkey_format *in_f = &src->format;
692         struct bkey_packed *in, *out = vstruct_last(dst);
693         struct btree_nr_keys nr;
694
695         memset(&nr, 0, sizeof(nr));
696
697         while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
698                 if (filter_whiteouts && bkey_whiteout(in))
699                         continue;
700
701                 if (bch2_bkey_transform(out_f, out, bkey_packed(in)
702                                        ? in_f : &bch2_bkey_format_current, in))
703                         out->format = KEY_FORMAT_LOCAL_BTREE;
704                 else
705                         bch2_bkey_unpack(src, (void *) out, in);
706
707                 btree_keys_account_key_add(&nr, 0, out);
708                 out = bkey_next(out);
709         }
710
711         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
712         return nr;
713 }
714
715 /* Sort, repack, and merge: */
716 static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
717                                               struct bset *dst,
718                                               struct btree *src,
719                                               struct btree_node_iter *iter,
720                                               struct bkey_format *out_f,
721                                               bool filter_whiteouts,
722                                               key_filter_fn filter,
723                                               key_merge_fn merge)
724 {
725         struct bkey_packed *k, *prev = NULL, *out;
726         struct btree_nr_keys nr;
727         BKEY_PADDED(k) tmp;
728
729         memset(&nr, 0, sizeof(nr));
730
731         while ((k = bch2_btree_node_iter_next_all(iter, src))) {
732                 if (filter_whiteouts && bkey_whiteout(k))
733                         continue;
734
735                 /*
736                  * The filter might modify pointers, so we have to unpack the
737                  * key and values to &tmp.k:
738                  */
739                 bch2_bkey_unpack(src, &tmp.k, k);
740
741                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
742                         continue;
743
744                 /* prev is always unpacked, for key merging: */
745
746                 if (prev &&
747                     merge &&
748                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
749                         continue;
750
751                 /*
752                  * the current key becomes the new prev: advance prev, then
753                  * copy the current key - but first pack prev (in place):
754                  */
755                 if (prev) {
756                         bch2_bkey_pack(prev, (void *) prev, out_f);
757
758                         btree_keys_account_key_add(&nr, 0, prev);
759                         prev = bkey_next(prev);
760                 } else {
761                         prev = vstruct_last(dst);
762                 }
763
764                 bkey_copy(prev, &tmp.k);
765         }
766
767         if (prev) {
768                 bch2_bkey_pack(prev, (void *) prev, out_f);
769                 btree_keys_account_key_add(&nr, 0, prev);
770                 out = bkey_next(prev);
771         } else {
772                 out = vstruct_last(dst);
773         }
774
775         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
776         return nr;
777 }
778
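/*
 * Copy all live keys from @src into @dst, repacking them into @dst's key
 * format; @dst must have a single (unwritten) bset. For btrees with
 * key_normalize/key_merge ops, keys are also filtered and merged on the way
 * through (sort_repack_merge() above).
 */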
779 void bch2_btree_sort_into(struct bch_fs *c,
780                          struct btree *dst,
781                          struct btree *src)
782 {
783         struct btree_nr_keys nr;
784         struct btree_node_iter src_iter;
785         u64 start_time = local_clock();
786
787         BUG_ON(dst->nsets != 1);
788
789         bch2_bset_set_no_aux_tree(dst, dst->set);
790
791         bch2_btree_node_iter_init_from_start(&src_iter, src);
792
793         if (btree_node_ops(src)->key_normalize ||
794             btree_node_ops(src)->key_merge)
795                 nr = sort_repack_merge(c, btree_bset_first(dst),
796                                 src, &src_iter,
797                                 &dst->format,
798                                 true,
799                                 btree_node_ops(src)->key_normalize,
800                                 btree_node_ops(src)->key_merge);
801         else
802                 nr = sort_repack(btree_bset_first(dst),
803                                 src, &src_iter,
804                                 &dst->format,
805                                 true);
806
807         bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
808
809         set_btree_bset_end(dst, dst->set);
810
811         dst->nr.live_u64s       += nr.live_u64s;
812         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
813         dst->nr.packed_keys     += nr.packed_keys;
814         dst->nr.unpacked_keys   += nr.unpacked_keys;
815
816         bch2_verify_btree_nr_keys(dst);
817 }
818
819 #define SORT_CRIT       (4096 / sizeof(u64))
820
821 /*
822  * We're about to add another bset to the btree node, so if there are currently
823  * too many bsets, sort some of them together:
824  */
825 static bool btree_node_compact(struct bch_fs *c, struct btree *b,
826                                struct btree_iter *iter)
827 {
828         unsigned unwritten_idx;
829         bool ret = false;
830
831         for (unwritten_idx = 0;
832              unwritten_idx < b->nsets;
833              unwritten_idx++)
834                 if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
835                         break;
836
837         if (b->nsets - unwritten_idx > 1) {
838                 btree_node_sort(c, b, iter, unwritten_idx,
839                                 b->nsets, false);
840                 ret = true;
841         }
842
843         if (unwritten_idx > 1) {
844                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
845                 ret = true;
846         }
847
848         return ret;
849 }
850
851 void bch2_btree_build_aux_trees(struct btree *b)
852 {
853         struct bset_tree *t;
854
855         for_each_bset(b, t)
856                 bch2_bset_build_aux_tree(b, t,
857                                 !bset_written(b, bset(b, t)) &&
858                                 t == bset_tree_last(b));
859 }
860
861 /*
862  * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
863  * inserted into
864  *
865  * Safe to call if there already is an unwritten bset - will only add a new bset
866  * if @b doesn't already have one.
867  *
868  * If we sorted (invalidating outstanding iterators), @iter, if given, is reinitialized.
869  */
870 void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
871                           struct btree_iter *iter)
872 {
873         struct btree_node_entry *bne;
874         bool did_sort;
875
876         EBUG_ON(!(b->lock.state.seq & 1));
877         EBUG_ON(iter && iter->l[b->level].b != b);
878
879         did_sort = btree_node_compact(c, b, iter);
880
881         bne = want_new_bset(c, b);
882         if (bne)
883                 bch2_bset_init_next(c, b, bne);
884
885         bch2_btree_build_aux_trees(b);
886
887         if (iter && did_sort)
888                 bch2_btree_iter_reinit_node(iter, b);
889 }
890
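/*
 * Nonce for encrypting a bset: mixes the bset's byte offset within the node
 * with the node's seq and the bset's journal_seq, so every bset gets a
 * distinct nonce. For the first bset, the btree_node header from ->flags up
 * to ->keys is encrypted along with the keys.
 */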
891 static struct nonce btree_nonce(struct bset *i, unsigned offset)
892 {
893         return (struct nonce) {{
894                 [0] = cpu_to_le32(offset),
895                 [1] = ((__le32 *) &i->seq)[0],
896                 [2] = ((__le32 *) &i->seq)[1],
897                 [3] = ((__le32 *) &i->journal_seq)[0] ^ BCH_NONCE_BTREE,
898         }};
899 }
900
901 static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
902 {
903         struct nonce nonce = btree_nonce(i, offset);
904
905         if (!offset) {
906                 struct btree_node *bn = container_of(i, struct btree_node, keys);
907                 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
908
909                 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
910                              bytes);
911
912                 nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
913         }
914
915         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
916                      vstruct_end(i) - (void *) i->_data);
917 }
918
919 static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
920                          unsigned offset, int write, char *buf, size_t len)
921 {
922         char *out = buf, *end = buf + len;
923
924         out += scnprintf(out, end - out,
925                          "error validating btree node %s"
926                          "at btree %u level %u/%u\n"
927                          "pos %llu:%llu node offset %u",
928                          write ? "before write " : "",
929                          b->btree_id, b->level,
930                          c->btree_roots[b->btree_id].level,
931                          b->key.k.p.inode, b->key.k.p.offset,
932                          b->written);
933         if (i)
934                 out += scnprintf(out, end - out,
935                                  " bset u64s %u",
936                                  le16_to_cpu(i->u64s));
937
938         return out - buf;
939 }
940
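/*
 * How btree_err() below reacts to a validation problem found on read:
 *
 * FIXABLE:     can be repaired in place during initial gc/fsck; otherwise the
 *              read fails
 * WANT_RETRY:  reread from another replica if one is available
 * MUST_RETRY:  always retry (e.g. we appear to have read the wrong node)
 * FATAL:       cannot be repaired or retried
 */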
941 enum btree_err_type {
942         BTREE_ERR_FIXABLE,
943         BTREE_ERR_WANT_RETRY,
944         BTREE_ERR_MUST_RETRY,
945         BTREE_ERR_FATAL,
946 };
947
948 enum btree_validate_ret {
949         BTREE_RETRY_READ = 64,
950 };
951
952 #define btree_err(type, c, b, i, msg, ...)                              \
953 ({                                                                      \
954         __label__ out;                                                  \
955         char _buf[300], *out = _buf, *end = out + sizeof(_buf);         \
956                                                                         \
957         out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
958         out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);      \
959                                                                         \
960         if (type == BTREE_ERR_FIXABLE &&                                \
961             write == READ &&                                            \
962             !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
963                 mustfix_fsck_err(c, "%s", _buf);                        \
964                 goto out;                                               \
965         }                                                               \
966                                                                         \
967         switch (write) {                                                \
968         case READ:                                                      \
969                 bch_err(c, "%s", _buf);                                 \
970                                                                         \
971                 switch (type) {                                         \
972                 case BTREE_ERR_FIXABLE:                                 \
973                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
974                         goto fsck_err;                                  \
975                 case BTREE_ERR_WANT_RETRY:                              \
976                         if (have_retry) {                               \
977                                 ret = BTREE_RETRY_READ;                 \
978                                 goto fsck_err;                          \
979                         }                                               \
980                         break;                                          \
981                 case BTREE_ERR_MUST_RETRY:                              \
982                         ret = BTREE_RETRY_READ;                         \
983                         goto fsck_err;                                  \
984                 case BTREE_ERR_FATAL:                                   \
985                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
986                         goto fsck_err;                                  \
987                 }                                                       \
988                 break;                                                  \
989         case WRITE:                                                     \
990                 bch_err(c, "corrupt metadata before write: %s", _buf);  \
991                                                                         \
992                 if (bch2_fs_inconsistent(c)) {                          \
993                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
994                         goto fsck_err;                                  \
995                 }                                                       \
996                 break;                                                  \
997         }                                                               \
998 out:                                                                    \
999         true;                                                           \
1000 })
1001
1002 #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
1003
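/*
 * Validate one bset: for the first bset in a node this also checks the
 * btree_node header (btree id, level, max_key, key format) and byte swaps it
 * if needed. Individual keys that fail validation are dropped by shrinking
 * i->u64s around them; *whiteout_u64s is set to the length of the leading run
 * of separate whiteouts so the caller can split it out.
 */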
1004 static int validate_bset(struct bch_fs *c, struct btree *b,
1005                          struct bset *i, unsigned sectors,
1006                          unsigned *whiteout_u64s, int write,
1007                          bool have_retry)
1008 {
1009         struct bkey_packed *k, *prev = NULL;
1010         struct bpos prev_pos = POS_MIN;
1011         enum bkey_type type = btree_node_type(b);
1012         bool seen_non_whiteout = false;
1013         const char *err;
1014         int ret = 0;
1015
1016         if (i == &b->data->keys) {
1017                 /* These indicate that we read the wrong btree node: */
1018                 btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
1019                              BTREE_ERR_MUST_RETRY, c, b, i,
1020                              "incorrect btree id");
1021
1022                 btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
1023                              BTREE_ERR_MUST_RETRY, c, b, i,
1024                              "incorrect level");
1025
1026                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
1027                         u64 *p = (u64 *) &b->data->ptr;
1028
1029                         *p = swab64(*p);
1030                         bch2_bpos_swab(&b->data->min_key);
1031                         bch2_bpos_swab(&b->data->max_key);
1032                 }
1033
1034                 btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
1035                              BTREE_ERR_MUST_RETRY, c, b, i,
1036                              "incorrect max key");
1037
1038                 /* XXX: ideally we would be validating min_key too */
1039 #if 0
1040                 /*
1041                  * not correct anymore, due to btree node write error
1042                  * handling
1043                  *
1044                  * need to add b->data->seq to btree keys and verify
1045                  * against that
1046                  */
1047                 btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
1048                                                   b->data->ptr),
1049                              BTREE_ERR_FATAL, c, b, i,
1050                              "incorrect backpointer");
1051 #endif
1052                 err = bch2_bkey_format_validate(&b->data->format);
1053                 btree_err_on(err,
1054                              BTREE_ERR_FATAL, c, b, i,
1055                              "invalid bkey format: %s", err);
1056         }
1057
1058         if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
1059                          BTREE_ERR_FIXABLE, c, b, i,
1060                          "unsupported bset version")) {
1061                 i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1062                 i->u64s = 0;
1063                 return 0;
1064         }
1065
1066         if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
1067                          BTREE_ERR_FIXABLE, c, b, i,
1068                          "bset past end of btree node")) {
1069                 i->u64s = 0;
1070                 return 0;
1071         }
1072
1073         btree_err_on(b->written && !i->u64s,
1074                      BTREE_ERR_FIXABLE, c, b, i,
1075                      "empty bset");
1076
1077         if (!BSET_SEPARATE_WHITEOUTS(i)) {
1078                 seen_non_whiteout = true;
1079                 *whiteout_u64s = 0;
1080         }
1081
1082         for (k = i->start;
1083              k != vstruct_last(i);) {
1084                 struct bkey_s_c u;
1085                 struct bkey tmp;
1086                 const char *invalid;
1087
1088                 if (btree_err_on(!k->u64s,
1089                                  BTREE_ERR_FIXABLE, c, b, i,
1090                                  "KEY_U64s 0: %zu bytes of metadata lost",
1091                                  vstruct_end(i) - (void *) k)) {
1092                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1093                         break;
1094                 }
1095
1096                 if (btree_err_on(bkey_next(k) > vstruct_last(i),
1097                                  BTREE_ERR_FIXABLE, c, b, i,
1098                                  "key extends past end of bset")) {
1099                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1100                         break;
1101                 }
1102
1103                 if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
1104                                  BTREE_ERR_FIXABLE, c, b, i,
1105                                  "invalid bkey format %u", k->format)) {
1106                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1107                         memmove_u64s_down(k, bkey_next(k),
1108                                           (u64 *) vstruct_end(i) - (u64 *) k);
1109                         continue;
1110                 }
1111
1112                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
1113                         bch2_bkey_swab(type, &b->format, k);
1114
1115                 u = bkey_disassemble(b, k, &tmp);
1116
1117                 invalid = __bch2_bkey_invalid(c, type, u) ?:
1118                         bch2_bkey_in_btree_node(b, u) ?:
1119                         (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
1120                 if (invalid) {
1121                         char buf[160];
1122
1123                         bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
1124                         btree_err(BTREE_ERR_FIXABLE, c, b, i,
1125                                   "invalid bkey:\n%s\n%s", invalid, buf);
1126
1127                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1128                         memmove_u64s_down(k, bkey_next(k),
1129                                           (u64 *) vstruct_end(i) - (u64 *) k);
1130                         continue;
1131                 }
1132
1133                 /*
1134                  * with separate whiteouts (used for extents), the second
1135                  * set of keys can itself contain whiteouts, so we can't
1136                  * go solely off bkey_whiteout()...
1137                  */
1138
1139                 if (!seen_non_whiteout &&
1140                     (!bkey_whiteout(k) ||
1141                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
1142                         *whiteout_u64s = k->_data - i->_data;
1143                         seen_non_whiteout = true;
1144                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
1145                         btree_err(BTREE_ERR_FATAL, c, b, i,
1146                                   "keys out of order: %llu:%llu > %llu:%llu",
1147                                   prev_pos.inode,
1148                                   prev_pos.offset,
1149                                   u.k->p.inode,
1150                                   bkey_start_offset(u.k));
1151                         /* XXX: repair this */
1152                 }
1153
1154                 prev_pos = u.k->p;
1155                 prev = k;
1156                 k = bkey_next(k);
1157         }
1158
1159         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
1160 fsck_err:
1161         return ret;
1162 }
1163
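/*
 * Process a freshly read btree node: verify each bset's checksum, decrypt and
 * validate it, skip bsets whose journal sequence number is blacklisted, then
 * merge all bsets (resolving overlapping extents) into a single sorted bset in
 * a bounce buffer which is swapped with the node's buffer. Returns nonzero if
 * the read should be retried from another replica.
 */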
1164 int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
1165 {
1166         struct btree_node_entry *bne;
1167         struct btree_node_iter_large *iter;
1168         struct btree_node *sorted;
1169         struct bkey_packed *k;
1170         struct bset *i;
1171         bool used_mempool;
1172         unsigned u64s;
1173         int ret, retry_read = 0, write = READ;
1174
1175         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1176         iter->used = 0;
1177
1178         if (bch2_meta_read_fault("btree"))
1179                 btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
1180                           "dynamic fault");
1181
1182         btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
1183                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1184                      "bad magic");
1185
1186         btree_err_on(!b->data->keys.seq,
1187                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1188                      "bad btree header");
1189
1190         while (b->written < c->opts.btree_node_size) {
1191                 unsigned sectors, whiteout_u64s = 0;
1192                 struct nonce nonce;
1193                 struct bch_csum csum;
1194                 bool first = !b->written;
1195
1196                 if (!b->written) {
1197                         i = &b->data->keys;
1198
1199                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1200                                      BTREE_ERR_WANT_RETRY, c, b, i,
1201                                      "unknown checksum type");
1202
1203                         nonce = btree_nonce(i, b->written << 9);
1204                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1205
1206                         btree_err_on(bch2_crc_cmp(csum, b->data->csum),
1207                                      BTREE_ERR_WANT_RETRY, c, b, i,
1208                                      "invalid checksum");
1209
1210                         bset_encrypt(c, i, b->written << 9);
1211
1212                         sectors = vstruct_sectors(b->data, c->block_bits);
1213
1214                         btree_node_set_format(b, b->data->format);
1215                 } else {
1216                         bne = write_block(b);
1217                         i = &bne->keys;
1218
1219                         if (i->seq != b->data->keys.seq)
1220                                 break;
1221
1222                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1223                                      BTREE_ERR_WANT_RETRY, c, b, i,
1224                                      "unknown checksum type");
1225
1226                         nonce = btree_nonce(i, b->written << 9);
1227                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1228
1229                         btree_err_on(bch2_crc_cmp(csum, bne->csum),
1230                                      BTREE_ERR_WANT_RETRY, c, b, i,
1231                                      "invalid checksum");
1232
1233                         bset_encrypt(c, i, b->written << 9);
1234
1235                         sectors = vstruct_sectors(bne, c->block_bits);
1236                 }
1237
1238                 ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
1239                                     READ, have_retry);
1240                 if (ret)
1241                         goto fsck_err;
1242
1243                 b->written += sectors;
1244
1245                 ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1246                 if (ret < 0) {
1247                         btree_err(BTREE_ERR_FATAL, c, b, i,
1248                                   "insufficient memory");
1249                         goto err;
1250                 }
1251
1252                 if (ret) {
1253                         btree_err_on(first,
1254                                      BTREE_ERR_FIXABLE, c, b, i,
1255                                      "first btree node bset has blacklisted journal seq");
1256                         if (!first)
1257                                 continue;
1258                 }
1259
1260                 bch2_btree_node_iter_large_push(iter, b,
1261                                            i->start,
1262                                            vstruct_idx(i, whiteout_u64s));
1263
1264                 bch2_btree_node_iter_large_push(iter, b,
1265                                            vstruct_idx(i, whiteout_u64s),
1266                                            vstruct_last(i));
1267         }
1268
1269         for (bne = write_block(b);
1270              bset_byte_offset(b, bne) < btree_bytes(c);
1271              bne = (void *) bne + block_bytes(c))
1272                 btree_err_on(bne->keys.seq == b->data->keys.seq,
1273                              BTREE_ERR_WANT_RETRY, c, b, NULL,
1274                              "found bset signature after last bset");
1275
1276         sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
1277         sorted->keys.u64s = 0;
1278
1279         set_btree_bset(b, b->set, &b->data->keys);
1280
1281         b->nr = btree_node_is_extents(b)
1282                 ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1283                 : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
1284
1285         u64s = le16_to_cpu(sorted->keys.u64s);
1286         *sorted = *b->data;
1287         sorted->keys.u64s = cpu_to_le16(u64s);
1288         swap(sorted, b->data);
1289         set_btree_bset(b, b->set, &b->data->keys);
1290         b->nsets = 1;
1291
1292         BUG_ON(b->nr.live_u64s != u64s);
1293
1294         btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
1295
1296         i = &b->data->keys;
1297         for (k = i->start; k != vstruct_last(i);) {
1298                 enum bkey_type type = btree_node_type(b);
1299                 struct bkey tmp;
1300                 struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
1301                 const char *invalid = bch2_bkey_val_invalid(c, type, u);
1302
1303                 if (invalid ||
1304                     (inject_invalid_keys(c) &&
1305                      !bversion_cmp(u.k->version, MAX_VERSION))) {
1306                         char buf[160];
1307
1308                         bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
1309                         btree_err(BTREE_ERR_FIXABLE, c, b, i,
1310                                   "invalid bkey %s: %s", buf, invalid);
1311
1312                         btree_keys_account_key_drop(&b->nr, 0, k);
1313
1314                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1315                         memmove_u64s_down(k, bkey_next(k),
1316                                           (u64 *) vstruct_end(i) - (u64 *) k);
1317                         set_btree_bset_end(b, b->set);
1318                         continue;
1319                 }
1320
1321                 k = bkey_next(k);
1322         }
1323
1324         bch2_bset_build_aux_tree(b, b->set, false);
1325
1326         set_needs_whiteout(btree_bset_first(b));
1327
1328         btree_node_reset_sib_u64s(b);
1329 out:
1330         mempool_free(iter, &c->fill_iter);
1331         return retry_read;
1332 err:
1333 fsck_err:
1334         if (ret == BTREE_RETRY_READ) {
1335                 retry_read = 1;
1336         } else {
1337                 bch2_inconsistent_error(c);
1338                 set_btree_node_read_error(b);
1339         }
1340         goto out;
1341 }
1342
1343 static void btree_node_read_work(struct work_struct *work)
1344 {
1345         struct btree_read_bio *rb =
1346                 container_of(work, struct btree_read_bio, work);
1347         struct bch_fs *c        = rb->c;
1348         struct bch_dev *ca      = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
1349         struct btree *b         = rb->bio.bi_private;
1350         struct bio *bio         = &rb->bio;
1351         struct bch_devs_mask avoid;
1352         bool can_retry;
1353
1354         memset(&avoid, 0, sizeof(avoid));
1355
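        /*
         * Jump into the middle of the loop: the first pass only checks the
         * bio that bch2_btree_node_read() already submitted; later passes
         * reissue the read from another replica, avoiding devices that have
         * already failed.
         */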
1356         goto start;
1357         while (1) {
1358                 bch_info(c, "retrying read");
1359                 ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
1360                 rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
1361                 bio_reset(bio);
1362                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1363                 bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
1364                 bio->bi_iter.bi_size    = btree_bytes(c);
1365
1366                 if (rb->have_ioref) {
1367                         bio_set_dev(bio, ca->disk_sb.bdev);
1368                         submit_bio_wait(bio);
1369                 } else {
1370                         bio->bi_status = BLK_STS_REMOVED;
1371                 }
1372 start:
1373                 bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
1374                 if (rb->have_ioref)
1375                         percpu_ref_put(&ca->io_ref);
1376                 rb->have_ioref = false;
1377
1378                 __set_bit(rb->pick.ptr.dev, avoid.d);
1379                 can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
1380
1381                 if (!bio->bi_status &&
1382                     !bch2_btree_node_read_done(c, b, can_retry))
1383                         break;
1384
1385                 if (!can_retry) {
1386                         set_btree_node_read_error(b);
1387                         break;
1388                 }
1389         }
1390
1391         bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
1392         bio_put(&rb->bio);
1393         clear_btree_node_read_in_flight(b);
1394         wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1395 }
1396
1397 static void btree_node_read_endio(struct bio *bio)
1398 {
1399         struct btree_read_bio *rb =
1400                 container_of(bio, struct btree_read_bio, bio);
1401         struct bch_fs *c        = rb->c;
1402
1403         if (rb->have_ioref) {
1404                 struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
1405                 bch2_latency_acct(ca, rb->start_time, READ);
1406         }
1407
1408         queue_work(system_unbound_wq, &rb->work);
1409 }
1410
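/*
 * Start reading a btree node: pick a replica to read from, set up the
 * btree_read_bio and submit it. Completion (or the synchronous path) runs
 * btree_node_read_work(), which validates the node and retries from other
 * replicas if needed.
 */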
1411 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
1412                           bool sync)
1413 {
1414         struct extent_pick_ptr pick;
1415         struct btree_read_bio *rb;
1416         struct bch_dev *ca;
1417         struct bio *bio;
1418         int ret;
1419
1420         trace_btree_read(c, b);
1421
1422         ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
1423         if (bch2_fs_fatal_err_on(ret <= 0, c,
1424                         "btree node read error: no device to read from")) {
1425                 set_btree_node_read_error(b);
1426                 return;
1427         }
1428
1429         ca = bch_dev_bkey_exists(c, pick.ptr.dev);
1430
1431         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
1432         rb = container_of(bio, struct btree_read_bio, bio);
1433         rb->c                   = c;
1434         rb->start_time          = local_clock();
1435         rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
1436         rb->pick                = pick;
1437         INIT_WORK(&rb->work, btree_node_read_work);
1438         bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1439         bio->bi_iter.bi_sector  = pick.ptr.offset;
1440         bio->bi_iter.bi_size    = btree_bytes(c);
1441         bio->bi_end_io          = btree_node_read_endio;
1442         bio->bi_private         = b;
1443         bch2_bio_map(bio, b->data);
1444
1445         set_btree_node_read_in_flight(b);
1446
1447         if (rb->have_ioref) {
1448                 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
1449                              bio_sectors(bio));
1450                 bio_set_dev(bio, ca->disk_sb.bdev);
1451
1452                 if (sync) {
1453                         submit_bio_wait(bio);
1454
1455                         bio->bi_private = b;
1456                         btree_node_read_work(&rb->work);
1457                 } else {
1458                         submit_bio(bio);
1459                 }
1460         } else {
1461                 bio->bi_status = BLK_STS_REMOVED;
1462
1463                 if (sync)
1464                         btree_node_read_work(&rb->work);
1465                 else
1466                         queue_work(system_unbound_wq, &rb->work);
1467
1468         }
1469 }
1470
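/*
 * Read the root of btree @id from the pointer in @k: allocate an in-memory
 * node (cannibalizing the btree node cache if necessary), hash it, read it
 * synchronously, then install it as the root. On read error the node is moved
 * to the freeable list and -EIO is returned.
 */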
1471 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
1472                         const struct bkey_i *k, unsigned level)
1473 {
1474         struct closure cl;
1475         struct btree *b;
1476         int ret;
1477
1478         closure_init_stack(&cl);
1479
1480         do {
1481                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1482                 closure_sync(&cl);
1483         } while (ret);
1484
1485         b = bch2_btree_node_mem_alloc(c);
1486         bch2_btree_cache_cannibalize_unlock(c);
1487
1488         BUG_ON(IS_ERR(b));
1489
1490         bkey_copy(&b->key, k);
1491         BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
1492
1493         bch2_btree_node_read(c, b, true);
1494
1495         if (btree_node_read_error(b)) {
1496                 bch2_btree_node_hash_remove(&c->btree_cache, b);
1497
1498                 mutex_lock(&c->btree_cache.lock);
1499                 list_move(&b->list, &c->btree_cache.freeable);
1500                 mutex_unlock(&c->btree_cache.lock);
1501
1502                 ret = -EIO;
1503                 goto err;
1504         }
1505
1506         bch2_btree_set_root_for_read(c, b);
1507 err:
1508         six_unlock_write(&b->lock);
1509         six_unlock_intent(&b->lock);
1510
1511         return ret;
1512 }
1513
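/*
 * Called when a write to @b completes: if the low bit of will_make_reachable
 * is set, the rest of the word points at an in-flight btree_update - clear the
 * bit and drop the closure ref this write was holding on it. Then drop the
 * journal pin for this write and wake anyone waiting on it.
 */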
1514 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1515                               struct btree_write *w)
1516 {
1517         unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
1518
1519         do {
1520                 old = new = v;
1521                 if (!(old & 1))
1522                         break;
1523
1524                 new &= ~1UL;
1525         } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
1526
1527         if (old & 1)
1528                 closure_put(&((struct btree_update *) new)->cl);
1529
1530         bch2_journal_pin_drop(&c->journal, &w->journal);
1531         closure_wake_up(&w->wait);
1532 }
1533
1534 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1535 {
1536         struct btree_write *w = btree_prev_write(b);
1537
1538         bch2_btree_complete_write(c, b, w);
1539         btree_node_io_unlock(b);
1540 }
1541
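/*
 * A btree node write failed on one or more devices: re-traverse to the node,
 * drop the pointers to the failed devices from its key and update it with
 * bch2_btree_node_update_key(). If the node has since been freed there's
 * nothing to do; if no replicas succeeded (no pointers left) or the key update
 * fails, mark the node noevict and declare a fatal filesystem error.
 */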
1542 static void bch2_btree_node_write_error(struct bch_fs *c,
1543                                         struct btree_write_bio *wbio)
1544 {
1545         struct btree *b         = wbio->wbio.bio.bi_private;
1546         __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
1547         struct bkey_i_extent *new_key;
1548         struct bkey_s_extent e;
1549         struct bch_extent_ptr *ptr;
1550         struct btree_iter iter;
1551         int ret;
1552
1553         __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
1554                                BTREE_MAX_DEPTH,
1555                                b->level, BTREE_ITER_NODES);
1556 retry:
1557         ret = bch2_btree_iter_traverse(&iter);
1558         if (ret)
1559                 goto err;
1560
1561         /* has node been freed? */
1562         if (iter.l[b->level].b != b) {
1563                 /* node has been freed: */
1564                 BUG_ON(!btree_node_dying(b));
1565                 goto out;
1566         }
1567
1568         BUG_ON(!btree_node_hashed(b));
1569
1570         bkey_copy(&tmp.k, &b->key);
1571
1572         new_key = bkey_i_to_extent(&tmp.k);
1573         e = extent_i_to_s(new_key);
1574         extent_for_each_ptr_backwards(e, ptr)
1575                 if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
1576                         bch2_extent_drop_ptr(e, ptr);
1577
1578         if (!bch2_extent_nr_ptrs(e.c))
1579                 goto err;
1580
1581         ret = bch2_btree_node_update_key(c, &iter, b, new_key);
1582         if (ret == -EINTR)
1583                 goto retry;
1584         if (ret)
1585                 goto err;
1586 out:
1587         bch2_btree_iter_unlock(&iter);
1588         bio_put(&wbio->wbio.bio);
1589         btree_node_write_done(c, b);
1590         return;
1591 err:
1592         set_btree_node_noevict(b);
1593         bch2_fs_fatal_error(c, "fatal error writing btree node");
1594         goto out;
1595 }
1596
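/*
 * Work item that drains btree_write_error_list, repairing each node's key via
 * bch2_btree_node_write_error(). Queued from btree_node_write_work() when a
 * write had failed replicas.
 */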
1597 void bch2_btree_write_error_work(struct work_struct *work)
1598 {
1599         struct bch_fs *c = container_of(work, struct bch_fs,
1600                                         btree_write_error_work);
1601         struct bio *bio;
1602
1603         while (1) {
1604                 spin_lock_irq(&c->btree_write_error_lock);
1605                 bio = bio_list_pop(&c->btree_write_error_list);
1606                 spin_unlock_irq(&c->btree_write_error_lock);
1607
1608                 if (!bio)
1609                         break;
1610
1611                 bch2_btree_node_write_error(c,
1612                         container_of(bio, struct btree_write_bio, wbio.bio));
1613         }
1614 }
1615
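/*
 * Final, process-context stage of a btree node write: free the bounce buffer;
 * if any replica failed, queue the bio on btree_write_error_list so
 * bch2_btree_write_error_work() can fix up the node's key, otherwise complete
 * the write.
 */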
1616 static void btree_node_write_work(struct work_struct *work)
1617 {
1618         struct btree_write_bio *wbio =
1619                 container_of(work, struct btree_write_bio, work);
1620         struct bch_fs *c        = wbio->wbio.c;
1621         struct btree *b         = wbio->wbio.bio.bi_private;
1622
1623         btree_bounce_free(c,
1624                 wbio->wbio.order,
1625                 wbio->wbio.used_mempool,
1626                 wbio->data);
1627
1628         if (wbio->wbio.failed.nr) {
1629                 unsigned long flags;
1630
1631                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1632                 bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
1633                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1634
1635                 queue_work(c->wq, &c->btree_write_error_work);
1636                 return;
1637         }
1638
1639         bio_put(&wbio->wbio.bio);
1640         btree_node_write_done(c, b);
1641 }
1642
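/*
 * Per-device write completion: account write latency, record a failed device
 * in the original (unsplit) wbio's failed list, and drop the device ioref.
 * Split bios complete into their parent; when the original bio completes, the
 * remaining work is punted to btree_node_write_work().
 */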
1643 static void btree_node_write_endio(struct bio *bio)
1644 {
1645         struct bch_write_bio *wbio      = to_wbio(bio);
1646         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
1647         struct bch_write_bio *orig      = parent ?: wbio;
1648         struct bch_fs *c                = wbio->c;
1649         struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
1650         unsigned long flags;
1651
1652         if (wbio->have_ioref)
1653                 bch2_latency_acct(ca, wbio->submit_time, WRITE);
1654
1655         if (bio->bi_status == BLK_STS_REMOVED ||
1656             bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
1657             bch2_meta_write_fault("btree")) {
1658                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1659                 bch2_dev_list_add_dev(&orig->failed, wbio->dev);
1660                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1661         }
1662
1663         if (wbio->have_ioref)
1664                 percpu_ref_put(&ca->io_ref);
1665
1666         if (parent) {
1667                 bio_put(bio);
1668                 bio_endio(&parent->bio);
1669         } else {
1670                 struct btree_write_bio *wb =
1671                         container_of(orig, struct btree_write_bio, wbio);
1672
1673                 INIT_WORK(&wb->work, btree_node_write_work);
1674                 queue_work(system_unbound_wq, &wb->work);
1675         }
1676 }
1677
1678 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
1679                                    struct bset *i, unsigned sectors)
1680 {
1681         const struct bch_extent_ptr *ptr;
1682         unsigned whiteout_u64s = 0;
1683         int ret;
1684
1685         extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
1686                 break;
1687
1688         ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
1689         if (ret)
1690                 bch2_inconsistent_error(c);
1691
1692         return ret;
1693 }
1694
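/*
 * Write out everything in @b that hasn't been written yet: claim the write by
 * atomically clearing the dirty bit, compact whiteouts, merge sort all
 * unwritten bsets (plus unwritten whiteouts) into a bounce buffer, checksum
 * and (if enabled) encrypt it, then submit it to every replica with
 * bch2_submit_wbio_replicas(). On journal error or with -o nochanges the write
 * is dropped and the node is marked noevict.
 */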
1695 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1696                             enum six_lock_type lock_type_held)
1697 {
1698         struct btree_write_bio *wbio;
1699         struct bset_tree *t;
1700         struct bset *i;
1701         struct btree_node *bn = NULL;
1702         struct btree_node_entry *bne = NULL;
1703         BKEY_PADDED(key) k;
1704         struct bkey_s_extent e;
1705         struct bch_extent_ptr *ptr;
1706         struct sort_iter sort_iter;
1707         struct nonce nonce;
1708         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1709         u64 seq = 0;
1710         bool used_mempool;
1711         unsigned long old, new;
1712         void *data;
1713
1714         if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
1715                 return;
1716
1717         /*
1718          * We may only have a read lock on the btree node - the dirty bit is our
1719          * "lock" against racing with other threads that may be trying to start
1720          * a write; we only do the write if we're the ones who clear the dirty
1721          * bit. Since setting the dirty bit requires a write lock, we can't race
1722          * with other threads redirtying it:
1723          */
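	/*
	 * Illustrative sketch of the transition below (assuming the node is
	 * dirty and no write is currently in flight; other flags are left
	 * untouched):
	 *
	 *	old: dirty | need_write			| write_idx = n
	 *	new: write_in_flight | just_written	| write_idx = !n
	 */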
1724         do {
1725                 old = new = READ_ONCE(b->flags);
1726
1727                 if (!(old & (1 << BTREE_NODE_dirty)))
1728                         return;
1729
1730                 if (b->written &&
1731                     !btree_node_may_write(b))
1732                         return;
1733
1734                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1735                         btree_node_wait_on_io(b);
1736                         continue;
1737                 }
1738
1739                 new &= ~(1 << BTREE_NODE_dirty);
1740                 new &= ~(1 << BTREE_NODE_need_write);
1741                 new |=  (1 << BTREE_NODE_write_in_flight);
1742                 new |=  (1 << BTREE_NODE_just_written);
1743                 new ^=  (1 << BTREE_NODE_write_idx);
1744         } while (cmpxchg_acquire(&b->flags, old, new) != old);
1745
1746         BUG_ON(btree_node_fake(b));
1747         BUG_ON(!list_empty(&b->write_blocked));
1748         BUG_ON((b->will_make_reachable != 0) != !b->written);
1749
1750         BUG_ON(b->written >= c->opts.btree_node_size);
1751         BUG_ON(b->written & (c->opts.block_size - 1));
1752         BUG_ON(bset_written(b, btree_bset_last(b)));
1753         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1754         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1755
1756         /*
1757          * We can't block on six_lock_write() here; another thread might be
1758          * trying to get a journal reservation with read locks held, and getting
1759          * a journal reservation might be blocked on flushing the journal and
1760          * doing btree writes:
1761          */
1762         if (lock_type_held == SIX_LOCK_intent &&
1763             six_trylock_write(&b->lock)) {
1764                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
1765                 six_unlock_write(&b->lock);
1766         } else {
1767                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1768         }
1769
1770         BUG_ON(b->uncompacted_whiteout_u64s);
1771
1772         sort_iter_init(&sort_iter, b);
1773
1774         bytes = !b->written
1775                 ? sizeof(struct btree_node)
1776                 : sizeof(struct btree_node_entry);
1777
1778         bytes += b->whiteout_u64s * sizeof(u64);
1779
1780         for_each_bset(b, t) {
1781                 i = bset(b, t);
1782
1783                 if (bset_written(b, i))
1784                         continue;
1785
1786                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1787                 sort_iter_add(&sort_iter,
1788                               btree_bkey_first(b, t),
1789                               btree_bkey_last(b, t));
1790                 seq = max(seq, le64_to_cpu(i->journal_seq));
1791         }
1792
1793         order = get_order(bytes);
1794         data = btree_bounce_alloc(c, order, &used_mempool);
1795
1796         if (!b->written) {
1797                 bn = data;
1798                 *bn = *b->data;
1799                 i = &bn->keys;
1800         } else {
1801                 bne = data;
1802                 bne->keys = b->data->keys;
1803                 i = &bne->keys;
1804         }
1805
1806         i->journal_seq  = cpu_to_le64(seq);
1807         i->u64s         = 0;
1808
1809         if (!btree_node_is_extents(b)) {
1810                 sort_iter_add(&sort_iter,
1811                               unwritten_whiteouts_start(c, b),
1812                               unwritten_whiteouts_end(c, b));
1813                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1814         } else {
1815                 memcpy_u64s(i->start,
1816                             unwritten_whiteouts_start(c, b),
1817                             b->whiteout_u64s);
1818                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1819                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1820         }
1821
1822         b->whiteout_u64s = 0;
1823
1824         u64s = btree_node_is_extents(b)
1825                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1826                 : sort_keys(i->start, &sort_iter, false);
1827         le16_add_cpu(&i->u64s, u64s);
1828
1829         clear_needs_whiteout(i);
1830
1831         /* do we have data to write? */
1832         if (b->written && !i->u64s)
1833                 goto nowrite;
1834
1835         bytes_to_write = vstruct_end(i) - data;
1836         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1837
1838         memset(data + bytes_to_write, 0,
1839                (sectors_to_write << 9) - bytes_to_write);
1840
1841         BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
1842         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1843         BUG_ON(i->seq != b->data->keys.seq);
1844
1845         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1846         SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
1847
1848         /* if we're going to be encrypting, check metadata validity first: */
1849         if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1850             validate_bset_for_write(c, b, i, sectors_to_write))
1851                 goto err;
1852
1853         bset_encrypt(c, i, b->written << 9);
1854
1855         nonce = btree_nonce(i, b->written << 9);
1856
1857         if (bn)
1858                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1859         else
1860                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1861
1862         /* if we're not encrypting, check metadata after checksumming: */
1863         if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1864             validate_bset_for_write(c, b, i, sectors_to_write))
1865                 goto err;
1866
1867         /*
1868          * We handle btree write errors by immediately halting the journal -
1869          * after we've done that, we can't issue any subsequent btree writes
1870          * because they might have pointers to new nodes that failed to write.
1871          *
1872          * Furthermore, there's no point in doing any more btree writes because
1873          * with the journal stopped, we're never going to update the journal to
1874          * reflect that those writes were done and the data flushed from the
1875          * journal:
1876          *
1877          * Make sure to update b->written so bch2_btree_init_next() doesn't
1878          * break:
1879          */
1880         if (bch2_journal_error(&c->journal) ||
1881             c->opts.nochanges)
1882                 goto err;
1883
1884         trace_btree_write(b, bytes_to_write, sectors_to_write);
1885
1886         wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
1887                             struct btree_write_bio, wbio.bio);
1888         wbio_init(&wbio->wbio.bio);
1889         wbio->data                      = data;
1890         wbio->wbio.order                = order;
1891         wbio->wbio.used_mempool         = used_mempool;
1892         wbio->wbio.bio.bi_opf           = REQ_OP_WRITE|REQ_META|REQ_FUA;
1893         wbio->wbio.bio.bi_iter.bi_size  = sectors_to_write << 9;
1894         wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
1895         wbio->wbio.bio.bi_private       = b;
1896
1897         bch2_bio_map(&wbio->wbio.bio, data);
1898
1899         /*
1900          * If we're appending to a leaf node, we don't technically need FUA -
1901          * this write just needs to be persisted before the next journal write,
1902          * which will be marked FLUSH|FUA.
1903          *
1904          * Similarly if we're writing a new btree root - the pointer is going to
1905          * be in the next journal entry.
1906          *
1907          * But if we're writing a new btree node (that isn't a root) or
1908          * appending to a non-leaf btree node, we need either FUA or a flush
1909          * when we write the parent with the new pointer. FUA is cheaper than a
1910          * flush, and writes appending to leaf nodes aren't blocking anything,
1911          * so just make all btree node writes FUA to keep things sane.
1912          */
1913
1914         bkey_copy(&k.key, &b->key);
1915         e = bkey_i_to_s_extent(&k.key);
1916
1917         extent_for_each_ptr(e, ptr)
1918                 ptr->offset += b->written;
1919
1920         b->written += sectors_to_write;
1921
1922         bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
1923         return;
1924 err:
1925         set_btree_node_noevict(b);
1926         b->written += sectors_to_write;
1927 nowrite:
1928         btree_bounce_free(c, order, used_mempool, data);
1929         btree_node_write_done(c, b);
1930 }
1931
1932 /*
1933  * Work that must be done with write lock held:
1934  */
1935 bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
1936 {
1937         bool invalidated_iter = false;
1938         struct btree_node_entry *bne;
1939         struct bset_tree *t;
1940
1941         if (!btree_node_just_written(b))
1942                 return false;
1943
1944         BUG_ON(b->whiteout_u64s);
1945         BUG_ON(b->uncompacted_whiteout_u64s);
1946
1947         clear_btree_node_just_written(b);
1948
1949         /*
1950          * Note: immediately after write, bset_written() doesn't work - the
1951          * amount of data we had to write after compaction might have been
1952          * smaller than the offset of the last bset.
1953          *
1954          * However, we know that all bsets have been written here, as long as
1955          * we're still holding the write lock:
1956          */
1957
1958         /*
1959          * XXX: decide if we really want to unconditionally sort down to a
1960          * single bset:
1961          */
1962         if (b->nsets > 1) {
1963                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1964                 invalidated_iter = true;
1965         } else {
1966                 invalidated_iter = bch2_drop_whiteouts(b);
1967         }
1968
1969         for_each_bset(b, t)
1970                 set_needs_whiteout(bset(b, t));
1971
1972         bch2_btree_verify(c, b);
1973
1974         /*
1975          * If later we don't unconditionally sort down to a single bset, we have
1976          * to ensure this is still true:
1977          */
1978         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1979
1980         bne = want_new_bset(c, b);
1981         if (bne)
1982                 bch2_bset_init_next(c, b, bne);
1983
1984         bch2_btree_build_aux_trees(b);
1985
1986         return invalidated_iter;
1987 }
1988
1989 /*
1990  * Use this one if the node is read or intent locked, i.e. not write locked:
1991  */
1992 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1993                           enum six_lock_type lock_type_held)
1994 {
1995         BUG_ON(lock_type_held == SIX_LOCK_write);
1996
1997         if (lock_type_held == SIX_LOCK_intent ||
1998             six_lock_tryupgrade(&b->lock)) {
1999                 __bch2_btree_node_write(c, b, SIX_LOCK_intent);
2000
2001                 /* don't cycle lock unnecessarily: */
2002                 if (btree_node_just_written(b) &&
2003                     six_trylock_write(&b->lock)) {
2004                         bch2_btree_post_write_cleanup(c, b);
2005                         six_unlock_write(&b->lock);
2006                 }
2007
2008                 if (lock_type_held == SIX_LOCK_read)
2009                         six_lock_downgrade(&b->lock);
2010         } else {
2011                 __bch2_btree_node_write(c, b, SIX_LOCK_read);
2012         }
2013 }
2014
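/*
 * Wait for all in-flight btree node I/O of the given kind (@flag is
 * BTREE_NODE_read_in_flight or BTREE_NODE_write_in_flight): walk the btree
 * node hash under RCU and sleep on each node's flag bit; restart the walk
 * after every wait, since the RCU read lock had to be dropped.
 */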
2015 static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
2016 {
2017         struct bucket_table *tbl;
2018         struct rhash_head *pos;
2019         struct btree *b;
2020         unsigned i;
2021 restart:
2022         rcu_read_lock();
2023         for_each_cached_btree(b, c, tbl, i, pos)
2024                 if (test_bit(flag, &b->flags)) {
2025                         rcu_read_unlock();
2026                         wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
2027                         goto restart;
2028
2029                 }
2030         rcu_read_unlock();
2031 }
2032
2033 void bch2_btree_flush_all_reads(struct bch_fs *c)
2034 {
2035         __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
2036 }
2037
2038 void bch2_btree_flush_all_writes(struct bch_fs *c)
2039 {
2040         __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
2041 }
2042
2043 void bch2_btree_verify_flushed(struct bch_fs *c)
2044 {
2045         struct bucket_table *tbl;
2046         struct rhash_head *pos;
2047         struct btree *b;
2048         unsigned i;
2049
2050         rcu_read_lock();
2051         for_each_cached_btree(b, c, tbl, i, pos) {
2052                 unsigned long flags = READ_ONCE(b->flags);
2053
2054                 BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
2055                        (flags & (1 << BTREE_NODE_write_in_flight)));
2056         }
2057         rcu_read_unlock();
2058 }
2059
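/*
 * Debug helper: print one line per btree node that's still interesting (has
 * waiters or an outstanding will_make_reachable ref). Fields, matching the
 * format string below: d = dirty, l = level, w = sectors written,
 * b = write_blocked list non-empty, r = will_make_reachable != 0 : low bit,
 * c/p = waiters on the current/other write buffer.
 *
 * Illustrative (made-up) output line:
 *
 *	ffff88810a2b4000 d 1 l 0 w 512 b 0 r 1:1 c 0 p 0
 */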
2060 ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
2061 {
2062         char *out = buf, *end = buf + PAGE_SIZE;
2063         struct bucket_table *tbl;
2064         struct rhash_head *pos;
2065         struct btree *b;
2066         unsigned i;
2067
2068         rcu_read_lock();
2069         for_each_cached_btree(b, c, tbl, i, pos) {
2070                 unsigned long flags = READ_ONCE(b->flags);
2071                 unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
2072
2073                 if (//!(flags & (1 << BTREE_NODE_dirty)) &&
2074                     !b->writes[0].wait.list.first &&
2075                     !b->writes[1].wait.list.first &&
2076                     !(b->will_make_reachable & 1))
2077                         continue;
2078
2079                 out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
2080                                  b,
2081                                  (flags & (1 << BTREE_NODE_dirty)) != 0,
2082                                  b->level,
2083                                  b->written,
2084                                  !list_empty_careful(&b->write_blocked),
2085                                  b->will_make_reachable != 0,
2086                                  b->will_make_reachable & 1,
2087                                  b->writes[ idx].wait.list.first != NULL,
2088                                  b->writes[!idx].wait.list.first != NULL);
2089         }
2090         rcu_read_unlock();
2091
2092         return out - buf;
2093 }