1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_update.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "buckets.h"
10 #include "checksum.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "io.h"
15 #include "journal.h"
16 #include "super-io.h"
17
18 #include <trace/events/bcachefs.h>
19
20 static void verify_no_dups(struct btree *b,
21                            struct bkey_packed *start,
22                            struct bkey_packed *end)
23 {
24 #ifdef CONFIG_BCACHEFS_DEBUG
25         struct bkey_packed *k;
26
27         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
28                 struct bkey l = bkey_unpack_key(b, k);
29                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
30
31                 BUG_ON(btree_node_is_extents(b)
32                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
33                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
34                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
35         }
36 #endif
37 }
38
39 static void clear_needs_whiteout(struct bset *i)
40 {
41         struct bkey_packed *k;
42
43         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
44                 k->needs_whiteout = false;
45 }
46
47 static void set_needs_whiteout(struct bset *i)
48 {
49         struct bkey_packed *k;
50
51         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
52                 k->needs_whiteout = true;
53 }
54
55 static void btree_bounce_free(struct bch_fs *c, unsigned order,
56                               bool used_mempool, void *p)
57 {
58         if (used_mempool)
59                 mempool_free(virt_to_page(p), &c->btree_bounce_pool);
60         else
61                 free_pages((unsigned long) p, order);
62 }
63
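/*
 * Bounce buffers for sorting/compacting: try an ordinary page allocation first
 * (GFP_NOWAIT, no warning on failure), falling back to the btree_bounce_pool
 * mempool.  The BUG_ON() below guards against requesting more than a full
 * btree node's worth of pages, which is presumably what the mempool is sized
 * for.
 */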
64 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
65                                 bool *used_mempool)
66 {
67         void *p;
68
69         BUG_ON(1 << order > btree_pages(c));
70
71         *used_mempool = false;
72         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
73         if (p)
74                 return p;
75
76         *used_mempool = true;
77         return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
78 }
79
80 typedef int (*sort_cmp_fn)(struct btree *,
81                            struct bkey_packed *,
82                            struct bkey_packed *);
83
84 struct sort_iter {
85         struct btree    *b;
86         unsigned                used;
87
88         struct sort_iter_set {
89                 struct bkey_packed *k, *end;
90         } data[MAX_BSETS + 1];
91 };
92
93 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
94 {
95         memset(iter, 0, sizeof(*iter));
96         iter->b = b;
97 }
98
99 static inline void __sort_iter_sift(struct sort_iter *iter,
100                                     unsigned from,
101                                     sort_cmp_fn cmp)
102 {
103         unsigned i;
104
105         for (i = from;
106              i + 1 < iter->used &&
107              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
108              i++)
109                 swap(iter->data[i], iter->data[i + 1]);
110 }
111
112 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
113 {
115         __sort_iter_sift(iter, 0, cmp);
116 }
117
118 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
119 {
120         unsigned i = iter->used;
121
122         while (i--)
123                 __sort_iter_sift(iter, i, cmp);
124 }
125
126 static void sort_iter_add(struct sort_iter *iter,
127                           struct bkey_packed *k,
128                           struct bkey_packed *end)
129 {
130         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
131
132         if (k != end)
133                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
134 }
135
136 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
137 {
138         return iter->used ? iter->data->k : NULL;
139 }
140
141 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
142 {
143         iter->data->k = bkey_next(iter->data->k);
144
145         BUG_ON(iter->data->k > iter->data->end);
146
147         if (iter->data->k == iter->data->end)
148                 memmove(&iter->data[0],
149                         &iter->data[1],
150                         sizeof(iter->data[0]) * --iter->used);
151         else
152                 sort_iter_sift(iter, cmp);
153 }
154
155 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
156                                                  sort_cmp_fn cmp)
157 {
158         struct bkey_packed *ret = sort_iter_peek(iter);
159
160         if (ret)
161                 sort_iter_advance(iter, cmp);
162
163         return ret;
164 }
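
/*
 * The sort_iter machinery above is a small N-way merge: callers add up to
 * MAX_BSETS + 1 ranges of packed keys, sort the heads once, then repeatedly
 * pull the smallest key.  A sketch of the usual pattern (cmp and emit() stand
 * in for a real comparison function and consumer):
 *
 *	struct sort_iter iter;
 *	struct bset_tree *t;
 *	struct bkey_packed *k;
 *
 *	sort_iter_init(&iter, b);
 *	for_each_bset(b, t)
 *		sort_iter_add(&iter, btree_bkey_first(b, t),
 *			      btree_bkey_last(b, t));
 *	sort_iter_sort(&iter, cmp);
 *	while ((k = sort_iter_next(&iter, cmp)))
 *		emit(k);
 *
 * It's an insertion sort style merge, which is fine for the handful of bsets a
 * node can have.
 */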
165
166 static inline int sort_key_whiteouts_cmp(struct btree *b,
167                                          struct bkey_packed *l,
168                                          struct bkey_packed *r)
169 {
170         return bkey_cmp_packed(b, l, r);
171 }
172
173 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
174                                    struct sort_iter *iter)
175 {
176         struct bkey_packed *in, *out = dst;
177
178         sort_iter_sort(iter, sort_key_whiteouts_cmp);
179
180         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
181                 bkey_copy(out, in);
182                 out = bkey_next(out);
183         }
184
185         return (u64 *) out - (u64 *) dst;
186 }
187
188 static inline int sort_extent_whiteouts_cmp(struct btree *b,
189                                             struct bkey_packed *l,
190                                             struct bkey_packed *r)
191 {
192         struct bkey ul = bkey_unpack_key(b, l);
193         struct bkey ur = bkey_unpack_key(b, r);
194
195         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
196 }
197
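/*
 * Merge adjacent/overlapping extent whiteouts as they're written out: when the
 * previous whiteout l overlaps the next one r, l is extended up to r's end (as
 * far as the packed format allows) and r is trimmed or dropped.  E.g.
 * whiteouts covering 0..8 and 4..12 should come out as a single whiteout
 * covering 0..12, assuming the new size still packs.
 */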
198 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
199                                       struct sort_iter *iter)
200 {
201         const struct bkey_format *f = &iter->b->format;
202         struct bkey_packed *in, *out = dst;
203         struct bkey_i l, r;
204         bool prev = false, l_packed = false;
205         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
206         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
207         u64 new_size;
208
209         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
210
211         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
212
213         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
214                 EBUG_ON(bkeyp_val_u64s(f, in));
215                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
216
217                 r.k = bkey_unpack_key(iter->b, in);
218
219                 if (prev &&
220                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
221                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
222                                 continue;
223
224                         new_size = l_packed
225                                 ? min(max_packed_size, max_packed_offset -
226                                       bkey_start_offset(&l.k))
227                                 : KEY_SIZE_MAX;
228
229                         new_size = min(new_size, r.k.p.offset -
230                                        bkey_start_offset(&l.k));
231
232                         BUG_ON(new_size < l.k.size);
233
234                         bch2_key_resize(&l.k, new_size);
235
236                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
237                                 continue;
238
239                         bch2_cut_front(l.k.p, &r);
240                 }
241
242                 if (prev) {
243                         if (!bch2_bkey_pack(out, &l, f)) {
244                                 BUG_ON(l_packed);
245                                 bkey_copy(out, &l);
246                         }
247                         out = bkey_next(out);
248                 }
249
250                 l = r;
251                 prev = true;
252                 l_packed = bkey_packed(in);
253         }
254
255         if (prev) {
256                 if (!bch2_bkey_pack(out, &l, f)) {
257                         BUG_ON(l_packed);
258                         bkey_copy(out, &l);
259                 }
260                 out = bkey_next(out);
261         }
262
263         return (u64 *) out - (u64 *) dst;
264 }
265
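/*
 * Heuristic for whether a bset is worth compacting: in COMPACT_LAZY mode, only
 * bother once less than 75% of the bset's u64s are live keys (or if we're
 * already compacting and this bset is unwritten); otherwise any written bset
 * with dead u64s qualifies.  E.g. a 1024 u64 bset with 700 live u64s gives
 * 700 * 4 = 2800 < 1024 * 3 = 3072, so it's compacted lazily, reclaiming
 * 324 u64s.
 */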
266 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
267                                     bool compacting,
268                                     enum compact_mode mode)
269 {
270         unsigned live_u64s = b->nr.bset_u64s[t - b->set];
271         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
272
273         if (live_u64s == bset_u64s)
274                 return 0;
275
276         if (mode == COMPACT_LAZY) {
277                 if (live_u64s * 4 < bset_u64s * 3 ||
278                     (compacting && bset_unwritten(b, bset(b, t))))
279                         return bset_u64s - live_u64s;
280         } else {
281                 if (bset_written(b, bset(b, t)))
282                         return bset_u64s - live_u64s;
283         }
284
285         return 0;
286 }
287
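/*
 * Compact the node in place: dead keys are dropped, and whiteouts that still
 * need to be written are gathered - together with any pending unwritten
 * whiteouts - sorted (merging overlapping extent whiteouts) and stashed in the
 * unwritten whiteout area, via a bounce buffer.  COMPACT_WRITTEN_NO_WRITE_LOCK
 * only collects whiteouts and leaves the bset contents untouched, presumably
 * because only a read lock is held in that case.  Returns false if there was
 * nothing to compact.
 */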
288 bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
289                              enum compact_mode mode)
290 {
291         const struct bkey_format *f = &b->format;
292         struct bset_tree *t;
293         struct bkey_packed *whiteouts = NULL;
294         struct bkey_packed *u_start, *u_pos;
295         struct sort_iter sort_iter;
296         unsigned order, whiteout_u64s = 0, u64s;
297         bool used_mempool, compacting = false;
298
299         for_each_bset(b, t)
300                 whiteout_u64s += should_compact_bset(b, t,
301                                         whiteout_u64s != 0, mode);
302
303         if (!whiteout_u64s)
304                 return false;
305
306         sort_iter_init(&sort_iter, b);
307
308         whiteout_u64s += b->whiteout_u64s;
309         order = get_order(whiteout_u64s * sizeof(u64));
310
311         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
312         u_start = u_pos = whiteouts;
313
314         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
315                     b->whiteout_u64s);
316         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
317
318         sort_iter_add(&sort_iter, u_start, u_pos);
319
320         for_each_bset(b, t) {
321                 struct bset *i = bset(b, t);
322                 struct bkey_packed *k, *n, *out, *start, *end;
323                 struct btree_node_entry *src = NULL, *dst = NULL;
324
325                 if (t != b->set && bset_unwritten(b, i)) {
326                         src = container_of(i, struct btree_node_entry, keys);
327                         dst = max(write_block(b),
328                                   (void *) btree_bkey_last(b, t - 1));
329                 }
330
331                 if (!should_compact_bset(b, t, compacting, mode)) {
332                         if (src != dst) {
333                                 memmove(dst, src, sizeof(*src) +
334                                         le16_to_cpu(src->keys.u64s) *
335                                         sizeof(u64));
336                                 i = &dst->keys;
337                                 set_btree_bset(b, t, i);
338                         }
339                         continue;
340                 }
341
342                 compacting = true;
343                 u_start = u_pos;
344                 start = i->start;
345                 end = vstruct_last(i);
346
347                 if (src != dst) {
348                         memmove(dst, src, sizeof(*src));
349                         i = &dst->keys;
350                         set_btree_bset(b, t, i);
351                 }
352
353                 out = i->start;
354
355                 for (k = start; k != end; k = n) {
356                         n = bkey_next(k);
357
358                         if (bkey_deleted(k) && btree_node_is_extents(b))
359                                 continue;
360
361                         if (bkey_whiteout(k) && !k->needs_whiteout)
362                                 continue;
363
364                         if (bkey_whiteout(k)) {
365                                 unreserve_whiteout(b, t, k);
366                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
367                                 set_bkeyp_val_u64s(f, u_pos, 0);
368                                 u_pos = bkey_next(u_pos);
369                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
370                                 bkey_copy(out, k);
371                                 out = bkey_next(out);
372                         }
373                 }
374
375                 sort_iter_add(&sort_iter, u_start, u_pos);
376
377                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
378                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
379                         set_btree_bset_end(b, t);
380                         bch2_bset_set_no_aux_tree(b, t);
381                 }
382         }
383
384         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
385
386         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
387                (void *) btree_bkey_last(b, bset_tree_last(b)));
388
389         u64s = btree_node_is_extents(b)
390                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
391                                         &sort_iter)
392                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
393                                      &sort_iter);
394
395         BUG_ON(u64s > b->whiteout_u64s);
396         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
397         BUG_ON(u_pos != whiteouts && !u64s);
398
399         if (u64s != b->whiteout_u64s) {
400                 void *src = unwritten_whiteouts_start(c, b);
401
402                 b->whiteout_u64s = u64s;
403                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
404         }
405
406         verify_no_dups(b,
407                        unwritten_whiteouts_start(c, b),
408                        unwritten_whiteouts_end(c, b));
409
410         btree_bounce_free(c, order, used_mempool, whiteouts);
411
412         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
413                 bch2_btree_build_aux_trees(b);
414
415         bch_btree_keys_u64s_remaining(c, b);
416         bch2_verify_btree_nr_keys(b);
417
418         return true;
419 }
420
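/*
 * Simpler variant of the above: discard whiteouts entirely instead of
 * preserving them, rewriting each compactable bset in place.  Returns true if
 * any bset was rewritten.
 */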
421 static bool bch2_drop_whiteouts(struct btree *b)
422 {
423         struct bset_tree *t;
424         bool ret = false;
425
426         for_each_bset(b, t) {
427                 struct bset *i = bset(b, t);
428                 struct bkey_packed *k, *n, *out, *start, *end;
429
430                 if (!should_compact_bset(b, t, true, true))
431                         continue;
432
433                 start   = btree_bkey_first(b, t);
434                 end     = btree_bkey_last(b, t);
435
436                 if (bset_unwritten(b, i) &&
437                     t != b->set) {
438                         struct bset *dst =
439                                max_t(struct bset *, write_block(b),
440                                      (void *) btree_bkey_last(b, t - 1));
441
442                         memmove(dst, i, sizeof(struct bset));
443                         i = dst;
444                         set_btree_bset(b, t, i);
445                 }
446
447                 out = i->start;
448
449                 for (k = start; k != end; k = n) {
450                         n = bkey_next(k);
451
452                         if (!bkey_whiteout(k)) {
453                                 bkey_copy(out, k);
454                                 out = bkey_next(out);
455                         }
456                 }
457
458                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
459                 bch2_bset_set_no_aux_tree(b, t);
460                 ret = true;
461         }
462
463         bch2_verify_btree_nr_keys(b);
464
465         return ret;
466 }
467
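/*
 * Merge sort for non-extent nodes: keys comparing equal are ordered so that
 * whiteouts come first; in sort_keys() a whiteout followed by another key at
 * the same position is dropped after propagating its needs_whiteout flag, and
 * surviving whiteouts are emitted with their value stripped.
 */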
468 static inline int sort_keys_cmp(struct btree *b,
469                                 struct bkey_packed *l,
470                                 struct bkey_packed *r)
471 {
472         return bkey_cmp_packed(b, l, r) ?:
473                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
474                 (int) l->needs_whiteout - (int) r->needs_whiteout;
475 }
476
477 static unsigned sort_keys(struct bkey_packed *dst,
478                           struct sort_iter *iter,
479                           bool filter_whiteouts)
480 {
481         const struct bkey_format *f = &iter->b->format;
482         struct bkey_packed *in, *next, *out = dst;
483
484         sort_iter_sort(iter, sort_keys_cmp);
485
486         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
487                 if (bkey_whiteout(in) &&
488                     (filter_whiteouts || !in->needs_whiteout))
489                         continue;
490
491                 if (bkey_whiteout(in) &&
492                     (next = sort_iter_peek(iter)) &&
493                     !bkey_cmp_packed(iter->b, in, next)) {
494                         BUG_ON(in->needs_whiteout &&
495                                next->needs_whiteout);
496                         /*
497                          * XXX racy, called with read lock from write path
498                          *
499                          * leads to spurious BUG_ON() in bkey_unpack_key() in
500                          * debug mode
501                          */
502                         next->needs_whiteout |= in->needs_whiteout;
503                         continue;
504                 }
505
506                 if (bkey_whiteout(in)) {
507                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
508                         set_bkeyp_val_u64s(f, out, 0);
509                 } else {
510                         bkey_copy(out, in);
511                 }
512                 out = bkey_next(out);
513         }
514
515         return (u64 *) out - (u64 *) dst;
516 }
517
518 static inline int sort_extents_cmp(struct btree *b,
519                                    struct bkey_packed *l,
520                                    struct bkey_packed *r)
521 {
522         return bkey_cmp_packed(b, l, r) ?:
523                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
524 }
525
526 static unsigned sort_extents(struct bkey_packed *dst,
527                              struct sort_iter *iter,
528                              bool filter_whiteouts)
529 {
530         struct bkey_packed *in, *out = dst;
531
532         sort_iter_sort(iter, sort_extents_cmp);
533
534         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
535                 if (bkey_deleted(in))
536                         continue;
537
538                 if (bkey_whiteout(in) &&
539                     (filter_whiteouts || !in->needs_whiteout))
540                         continue;
541
542                 bkey_copy(out, in);
543                 out = bkey_next(out);
544         }
545
546         return (u64 *) out - (u64 *) dst;
547 }
548
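/*
 * Sort bsets [start_idx, end_idx) of @b together into a single bset, via a
 * bounce buffer.  When the entire node is being sorted, the bounce buffer is a
 * full node-sized buffer and we simply swap it with b->data rather than
 * copying the result back.
 */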
549 static void btree_node_sort(struct bch_fs *c, struct btree *b,
550                             struct btree_iter *iter,
551                             unsigned start_idx,
552                             unsigned end_idx,
553                             bool filter_whiteouts)
554 {
555         struct btree_node *out;
556         struct sort_iter sort_iter;
557         struct bset_tree *t;
558         struct bset *start_bset = bset(b, &b->set[start_idx]);
559         bool used_mempool = false;
560         u64 start_time;
561         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
562         bool sorting_entire_node = start_idx == 0 &&
563                 end_idx == b->nsets;
564
565         sort_iter_init(&sort_iter, b);
566
567         for (t = b->set + start_idx;
568              t < b->set + end_idx;
569              t++) {
570                 u64s += le16_to_cpu(bset(b, t)->u64s);
571                 sort_iter_add(&sort_iter,
572                               btree_bkey_first(b, t),
573                               btree_bkey_last(b, t));
574         }
575
576         order = sorting_entire_node
577                 ? btree_page_order(c)
578                 : get_order(__vstruct_bytes(struct btree_node, u64s));
579
580         out = btree_bounce_alloc(c, order, &used_mempool);
581
582         start_time = local_clock();
583
584         if (btree_node_is_extents(b))
585                 filter_whiteouts = bset_written(b, start_bset);
586
587         u64s = btree_node_is_extents(b)
588                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
589                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
590
591         out->keys.u64s = cpu_to_le16(u64s);
592
593         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
594
595         if (sorting_entire_node)
596                 bch2_time_stats_update(&c->btree_sort_time, start_time);
597
598         /* Make sure we preserve bset journal_seq: */
599         for (t = b->set + start_idx + 1;
600              t < b->set + end_idx;
601              t++)
602                 start_bset->journal_seq =
603                         max(start_bset->journal_seq,
604                             bset(b, t)->journal_seq);
605
606         if (sorting_entire_node) {
607                 unsigned u64s = le16_to_cpu(out->keys.u64s);
608
609                 BUG_ON(order != btree_page_order(c));
610
611                 /*
612                  * Our temporary buffer is the same size as the btree node's
613                  * buffer, so we can just swap buffers instead of doing a big
614                  * memcpy()
615                  */
616                 *out = *b->data;
617                 out->keys.u64s = cpu_to_le16(u64s);
618                 swap(out, b->data);
619                 set_btree_bset(b, b->set, &b->data->keys);
620         } else {
621                 start_bset->u64s = out->keys.u64s;
622                 memcpy_u64s(start_bset->start,
623                             out->keys.start,
624                             le16_to_cpu(out->keys.u64s));
625         }
626
627         for (i = start_idx + 1; i < end_idx; i++)
628                 b->nr.bset_u64s[start_idx] +=
629                         b->nr.bset_u64s[i];
630
631         b->nsets -= shift;
632
633         for (i = start_idx + 1; i < b->nsets; i++) {
634                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
635                 b->set[i]               = b->set[i + shift];
636         }
637
638         for (i = b->nsets; i < MAX_BSETS; i++)
639                 b->nr.bset_u64s[i] = 0;
640
641         set_btree_bset_end(b, &b->set[start_idx]);
642         bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
643
644         btree_bounce_free(c, order, used_mempool, out);
645
646         bch2_verify_btree_nr_keys(b);
647 }
648
649 /* Sort + repack in a new format: */
650 static struct btree_nr_keys sort_repack(struct bset *dst,
651                                         struct btree *src,
652                                         struct btree_node_iter *src_iter,
653                                         struct bkey_format *out_f,
654                                         bool filter_whiteouts)
655 {
656         struct bkey_format *in_f = &src->format;
657         struct bkey_packed *in, *out = vstruct_last(dst);
658         struct btree_nr_keys nr;
659
660         memset(&nr, 0, sizeof(nr));
661
662         while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
663                 if (filter_whiteouts && bkey_whiteout(in))
664                         continue;
665
666                 if (bch2_bkey_transform(out_f, out, bkey_packed(in)
667                                        ? in_f : &bch2_bkey_format_current, in))
668                         out->format = KEY_FORMAT_LOCAL_BTREE;
669                 else
670                         bch2_bkey_unpack(src, (void *) out, in);
671
672                 btree_keys_account_key_add(&nr, 0, out);
673                 out = bkey_next(out);
674         }
675
676         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
677         return nr;
678 }
679
680 /* Sort, repack, and merge: */
681 static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
682                                               struct bset *dst,
683                                               struct btree *src,
684                                               struct btree_node_iter *iter,
685                                               struct bkey_format *out_f,
686                                               bool filter_whiteouts,
687                                               key_filter_fn filter,
688                                               key_merge_fn merge)
689 {
690         struct bkey_packed *k, *prev = NULL, *out;
691         struct btree_nr_keys nr;
692         BKEY_PADDED(k) tmp;
693
694         memset(&nr, 0, sizeof(nr));
695
696         while ((k = bch2_btree_node_iter_next_all(iter, src))) {
697                 if (filter_whiteouts && bkey_whiteout(k))
698                         continue;
699
700                 /*
701                  * The filter might modify pointers, so we have to unpack the
702                  * key and values to &tmp.k:
703                  */
704                 bch2_bkey_unpack(src, &tmp.k, k);
705
706                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
707                         continue;
708
709                 /* prev is always unpacked, for key merging: */
710
711                 if (prev &&
712                     merge &&
713                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
714                         continue;
715
716                 /*
717                  * the current key becomes the new prev: advance prev, then
718                  * copy the current key - but first pack prev (in place):
719                  */
720                 if (prev) {
721                         bch2_bkey_pack(prev, (void *) prev, out_f);
722
723                         btree_keys_account_key_add(&nr, 0, prev);
724                         prev = bkey_next(prev);
725                 } else {
726                         prev = vstruct_last(dst);
727                 }
728
729                 bkey_copy(prev, &tmp.k);
730         }
731
732         if (prev) {
733                 bch2_bkey_pack(prev, (void *) prev, out_f);
734                 btree_keys_account_key_add(&nr, 0, prev);
735                 out = bkey_next(prev);
736         } else {
737                 out = vstruct_last(dst);
738         }
739
740         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
741         return nr;
742 }
743
744 void bch2_btree_sort_into(struct bch_fs *c,
745                          struct btree *dst,
746                          struct btree *src)
747 {
748         struct btree_nr_keys nr;
749         struct btree_node_iter src_iter;
750         u64 start_time = local_clock();
751
752         BUG_ON(dst->nsets != 1);
753
754         bch2_bset_set_no_aux_tree(dst, dst->set);
755
756         bch2_btree_node_iter_init_from_start(&src_iter, src,
757                                             btree_node_is_extents(src));
758
759         if (btree_node_ops(src)->key_normalize ||
760             btree_node_ops(src)->key_merge)
761                 nr = sort_repack_merge(c, btree_bset_first(dst),
762                                 src, &src_iter,
763                                 &dst->format,
764                                 true,
765                                 btree_node_ops(src)->key_normalize,
766                                 btree_node_ops(src)->key_merge);
767         else
768                 nr = sort_repack(btree_bset_first(dst),
769                                 src, &src_iter,
770                                 &dst->format,
771                                 true);
772
773         bch2_time_stats_update(&c->btree_sort_time, start_time);
774
775         set_btree_bset_end(dst, dst->set);
776
777         dst->nr.live_u64s       += nr.live_u64s;
778         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
779         dst->nr.packed_keys     += nr.packed_keys;
780         dst->nr.unpacked_keys   += nr.unpacked_keys;
781
782         bch2_verify_btree_nr_keys(dst);
783 }
784
785 #define SORT_CRIT       (4096 / sizeof(u64))
786
787 /*
788  * We're about to add another bset to the btree node, so if there are currently
789  * too many bsets - sort some of them together:
790  */
791 static bool btree_node_compact(struct bch_fs *c, struct btree *b,
792                                struct btree_iter *iter)
793 {
794         unsigned unwritten_idx;
795         bool ret = false;
796
797         for (unwritten_idx = 0;
798              unwritten_idx < b->nsets;
799              unwritten_idx++)
800                 if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
801                         break;
802
803         if (b->nsets - unwritten_idx > 1) {
804                 btree_node_sort(c, b, iter, unwritten_idx,
805                                 b->nsets, false);
806                 ret = true;
807         }
808
809         if (unwritten_idx > 1) {
810                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
811                 ret = true;
812         }
813
814         return ret;
815 }
816
817 void bch2_btree_build_aux_trees(struct btree *b)
818 {
819         struct bset_tree *t;
820
821         for_each_bset(b, t)
822                 bch2_bset_build_aux_tree(b, t,
823                                 bset_unwritten(b, bset(b, t)) &&
824                                 t == bset_tree_last(b));
825 }
826
827 /*
828  * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
829  * inserted into
830  *
831  * Safe to call if there already is an unwritten bset - will only add a new bset
832  * if @b doesn't already have one.
833  *
834  * If we sorted (i.e. invalidated iterators), the iterator passed in is reinitialized.
835  */
836 void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
837                          struct btree_iter *iter)
838 {
839         struct btree_node_entry *bne;
840         bool did_sort;
841
842         EBUG_ON(!(b->lock.state.seq & 1));
843         EBUG_ON(iter && iter->nodes[b->level] != b);
844
845         did_sort = btree_node_compact(c, b, iter);
846
847         bne = want_new_bset(c, b);
848         if (bne)
849                 bch2_bset_init_next(b, &bne->keys);
850
851         bch2_btree_build_aux_trees(b);
852
853         if (iter && did_sort)
854                 bch2_btree_iter_reinit_node(iter, b);
855 }
856
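/*
 * Nonce for bset encryption: derived from the bset's offset within the node,
 * its seq, and the low bits of its journal_seq xored with BCH_NONCE_BTREE.
 * bset_encrypt() is used on both the read and write paths; presumably the
 * cipher (ChaCha20, going by the CHACHA20_BLOCK_SIZE used elsewhere in this
 * file) is a stream cipher, so the same operation encrypts and decrypts.
 */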
857 static struct nonce btree_nonce(struct btree *b,
858                                 struct bset *i,
859                                 unsigned offset)
860 {
861         return (struct nonce) {{
862                 [0] = cpu_to_le32(offset),
863                 [1] = ((__le32 *) &i->seq)[0],
864                 [2] = ((__le32 *) &i->seq)[1],
865                 [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
866         }};
867 }
868
869 static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
870 {
871         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
872                     vstruct_end(i) - (void *) i->_data);
873 }
874
875 #define btree_node_error(b, c, ptr, fmt, ...)                           \
876         bch2_fs_inconsistent(c,                                         \
877                 "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
878                 (b)->btree_id, (b)->level, btree_node_root(c, b)        \
879                             ? btree_node_root(c, b)->level : -1,        \
880                 PTR_BUCKET_NR(ca, ptr), (b)->written,                   \
881                 le16_to_cpu((i)->u64s), ##__VA_ARGS__)
882
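/*
 * Validate one bset as it's read in.  Problems that make the whole bset
 * unusable (bad version, bset past the end of the node) are returned as an
 * error string; individually bad keys are logged and either dropped or the
 * bset is truncated at that point.  *whiteout_u64s is set to the u64 offset of
 * the first key that isn't a leading whiteout, so the caller can feed the
 * separate whiteouts and the ordinary keys to the node iterator as two ranges.
 */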
883 static const char *validate_bset(struct bch_fs *c, struct btree *b,
884                                  struct bch_dev *ca,
885                                  const struct bch_extent_ptr *ptr,
886                                  struct bset *i, unsigned sectors,
887                                  unsigned *whiteout_u64s)
888 {
889         struct bkey_packed *k, *prev = NULL;
890         struct bpos prev_pos = POS_MIN;
891         bool seen_non_whiteout = false;
892
893         if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
894                 return "unsupported bset version";
895
896         if (b->written + sectors > c->sb.btree_node_size)
897                 return "bset past end of btree node";
898
899         if (i != &b->data->keys && !i->u64s)
900                 btree_node_error(b, c, ptr, "empty set");
901
902         if (!BSET_SEPARATE_WHITEOUTS(i)) {
903                 seen_non_whiteout = true;
904                 *whiteout_u64s = 0;
905         }
906
907         for (k = i->start;
908              k != vstruct_last(i);) {
909                 struct bkey_s_c u;
910                 struct bkey tmp;
911                 const char *invalid;
912
913                 if (!k->u64s) {
914                         btree_node_error(b, c, ptr,
915                                 "KEY_U64s 0: %zu bytes of metadata lost",
916                                 vstruct_end(i) - (void *) k);
917
918                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
919                         break;
920                 }
921
922                 if (bkey_next(k) > vstruct_last(i)) {
923                         btree_node_error(b, c, ptr,
924                                          "key extends past end of bset");
925
926                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
927                         break;
928                 }
929
930                 if (k->format > KEY_FORMAT_CURRENT) {
931                         btree_node_error(b, c, ptr,
932                                          "invalid bkey format %u", k->format);
933
934                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
935                         memmove_u64s_down(k, bkey_next(k),
936                                           (u64 *) vstruct_end(i) - (u64 *) k);
937                         continue;
938                 }
939
940                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
941                         bch2_bkey_swab(btree_node_type(b), &b->format, k);
942
943                 u = bkey_disassemble(b, k, &tmp);
944
945                 invalid = bch2_btree_bkey_invalid(c, b, u);
946                 if (invalid) {
947                         char buf[160];
948
949                         bch2_bkey_val_to_text(c, btree_node_type(b),
950                                              buf, sizeof(buf), u);
951                         btree_node_error(b, c, ptr,
952                                          "invalid bkey %s: %s", buf, invalid);
953
954                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
955                         memmove_u64s_down(k, bkey_next(k),
956                                           (u64 *) vstruct_end(i) - (u64 *) k);
957                         continue;
958                 }
959
960                 /*
961                  * with the separate whiteouts thing (used for extents), the
962                  * second set of keys actually can have whiteouts too, so we
963                  * can't solely go off bkey_whiteout()...
964                  */
965
966                 if (!seen_non_whiteout &&
967                     (!bkey_whiteout(k) ||
968                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
969                         *whiteout_u64s = k->_data - i->_data;
970                         seen_non_whiteout = true;
971                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
972                         btree_node_error(b, c, ptr,
973                                          "keys out of order: %llu:%llu > %llu:%llu",
974                                          prev_pos.inode,
975                                          prev_pos.offset,
976                                          u.k->p.inode,
977                                          bkey_start_offset(u.k));
978                         /* XXX: repair this */
979                 }
980
981                 prev_pos = u.k->p;
982                 prev = k;
983                 k = bkey_next(k);
984         }
985
986         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
987         return NULL;
988 }
989
990 static bool extent_contains_ptr(struct bkey_s_c_extent e,
991                                 struct bch_extent_ptr match)
992 {
993         const struct bch_extent_ptr *ptr;
994
995         extent_for_each_ptr(e, ptr)
996                 if (!memcmp(ptr, &match, sizeof(*ptr)))
997                         return true;
998
999         return false;
1000 }
1001
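/*
 * Parse a freshly read btree node: walk the on-disk bsets one at a time,
 * verifying checksums, decrypting and validating each one, then merge them all
 * (resolving overlapping extents where necessary) into a single sorted bset in
 * a bounce buffer, which is swapped in as the node's new contents.
 */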
1002 void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
1003                               struct bch_dev *ca,
1004                               const struct bch_extent_ptr *ptr)
1005 {
1006         struct btree_node_entry *bne;
1007         struct bset *i = &b->data->keys;
1008         struct btree_node_iter *iter;
1009         struct btree_node *sorted;
1010         bool used_mempool;
1011         unsigned u64s;
1012         const char *err;
1013         struct bch_csum csum;
1014         struct nonce nonce;
1015         int ret;
1016
1017         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1018         __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
1019
1020         err = "dynamic fault";
1021         if (bch2_meta_read_fault("btree"))
1022                 goto err;
1023
1024         while (b->written < c->sb.btree_node_size) {
1025                 unsigned sectors, whiteout_u64s = 0;
1026
1027                 if (!b->written) {
1028                         i = &b->data->keys;
1029
1030                         err = "bad magic";
1031                         if (le64_to_cpu(b->data->magic) != bset_magic(c))
1032                                 goto err;
1033
1034                         err = "bad btree header";
1035                         if (!b->data->keys.seq)
1036                                 goto err;
1037
1038                         err = "unknown checksum type";
1039                         if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1040                                 goto err;
1041
1042                         /* XXX: retry checksum errors */
1043
1044                         nonce = btree_nonce(b, i, b->written << 9);
1045                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1046
1047                         err = "bad checksum";
1048                         if (bch2_crc_cmp(csum, b->data->csum))
1049                                 goto err;
1050
1051                         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1052                                     &b->data->flags,
1053                                     (void *) &b->data->keys -
1054                                     (void *) &b->data->flags);
1055                         nonce = nonce_add(nonce,
1056                                           round_up((void *) &b->data->keys -
1057                                                    (void *) &b->data->flags,
1058                                                    CHACHA20_BLOCK_SIZE));
1059                         bset_encrypt(c, i, nonce);
1060
1061                         sectors = vstruct_sectors(b->data, c->block_bits);
1062
1063                         if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
1064                                 u64 *p = (u64 *) &b->data->ptr;
1065
1066                                 *p = swab64(*p);
1067                                 bch2_bpos_swab(&b->data->min_key);
1068                                 bch2_bpos_swab(&b->data->max_key);
1069                         }
1070
1071                         err = "incorrect btree id";
1072                         if (BTREE_NODE_ID(b->data) != b->btree_id)
1073                                 goto err;
1074
1075                         err = "incorrect level";
1076                         if (BTREE_NODE_LEVEL(b->data) != b->level)
1077                                 goto err;
1078
1079                         err = "incorrect max key";
1080                         if (bkey_cmp(b->data->max_key, b->key.k.p))
1081                                 goto err;
1082
1083                         err = "incorrect backpointer";
1084                         if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
1085                                                  b->data->ptr))
1086                                 goto err;
1087
1088                         err = bch2_bkey_format_validate(&b->data->format);
1089                         if (err)
1090                                 goto err;
1091
1092                         set_btree_bset(b, b->set, &b->data->keys);
1093
1094                         btree_node_set_format(b, b->data->format);
1095                 } else {
1096                         bne = write_block(b);
1097                         i = &bne->keys;
1098
1099                         if (i->seq != b->data->keys.seq)
1100                                 break;
1101
1102                         err = "unknown checksum type";
1103                         if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1104                                 goto err;
1105
1106                         nonce = btree_nonce(b, i, b->written << 9);
1107                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1108
1109                         err = "bad checksum";
1110                         if (memcmp(&csum, &bne->csum, sizeof(csum)))
1111                                 goto err;
1112
1113                         bset_encrypt(c, i, nonce);
1114
1115                         sectors = vstruct_sectors(bne, c->block_bits);
1116                 }
1117
1118                 err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
1119                 if (err)
1120                         goto err;
1121
1122                 b->written += sectors;
1123
1124                 err = "insufficient memory";
1125                 ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1126                 if (ret < 0)
1127                         goto err;
1128
1129                 if (ret)
1130                         continue;
1131
1132                 __bch2_btree_node_iter_push(iter, b,
1133                                            i->start,
1134                                            vstruct_idx(i, whiteout_u64s));
1135
1136                 __bch2_btree_node_iter_push(iter, b,
1137                                            vstruct_idx(i, whiteout_u64s),
1138                                            vstruct_last(i));
1139         }
1140
1141         err = "corrupted btree";
1142         for (bne = write_block(b);
1143              bset_byte_offset(b, bne) < btree_bytes(c);
1144              bne = (void *) bne + block_bytes(c))
1145                 if (bne->keys.seq == b->data->keys.seq)
1146                         goto err;
1147
1148         sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
1149         sorted->keys.u64s = 0;
1150
1151         b->nr = btree_node_is_extents(b)
1152                 ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1153                 : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
1154
1155         u64s = le16_to_cpu(sorted->keys.u64s);
1156         *sorted = *b->data;
1157         sorted->keys.u64s = cpu_to_le16(u64s);
1158         swap(sorted, b->data);
1159         set_btree_bset(b, b->set, &b->data->keys);
1160         b->nsets = 1;
1161
1162         BUG_ON(b->nr.live_u64s != u64s);
1163
1164         btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
1165
1166         bch2_bset_build_aux_tree(b, b->set, false);
1167
1168         set_needs_whiteout(btree_bset_first(b));
1169
1170         btree_node_reset_sib_u64s(b);
1171 out:
1172         mempool_free(iter, &c->fill_iter);
1173         return;
1174 err:
1175         set_btree_node_read_error(b);
1176         btree_node_error(b, c, ptr, "%s", err);
1177         goto out;
1178 }
1179
1180 void bch2_btree_node_read(struct bch_fs *c, struct btree *b)
1181 {
1182         u64 start_time = local_clock();
1183         struct bio *bio;
1184         struct extent_pick_ptr pick;
1185
1186         trace_btree_read(c, b);
1187
1188         pick = bch2_btree_pick_ptr(c, b);
1189         if (bch2_fs_fatal_err_on(!pick.ca, c,
1190                                 "no cache device for btree node")) {
1191                 set_btree_node_read_error(b);
1192                 return;
1193         }
1194
1195         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
1196         bio->bi_bdev            = pick.ca->disk_sb.bdev;
1197         bio->bi_iter.bi_sector  = pick.ptr.offset;
1198         bio->bi_iter.bi_size    = btree_bytes(c);
1199         bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
1200         bch2_bio_map(bio, b->data);
1201
1202         submit_bio_wait(bio);
1203
1204         if (bch2_dev_fatal_io_err_on(bio->bi_error,
1205                                   pick.ca, "IO error reading bucket %zu",
1206                                   PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
1207             bch2_meta_read_fault("btree")) {
1208                 set_btree_node_read_error(b);
1209                 goto out;
1210         }
1211
1212         bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
1213         bch2_time_stats_update(&c->btree_read_time, start_time);
1214 out:
1215         bio_put(bio);
1216         percpu_ref_put(&pick.ca->io_ref);
1217 }
1218
1219 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
1220                         const struct bkey_i *k, unsigned level)
1221 {
1222         struct closure cl;
1223         struct btree *b;
1224         int ret;
1225
1226         closure_init_stack(&cl);
1227
1228         do {
1229                 ret = bch2_btree_node_cannibalize_lock(c, &cl);
1230                 closure_sync(&cl);
1231         } while (ret);
1232
1233         b = bch2_btree_node_mem_alloc(c);
1234         bch2_btree_node_cannibalize_unlock(c);
1235
1236         BUG_ON(IS_ERR(b));
1237
1238         bkey_copy(&b->key, k);
1239         BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
1240
1241         bch2_btree_node_read(c, b);
1242         six_unlock_write(&b->lock);
1243
1244         if (btree_node_read_error(b)) {
1245                 six_unlock_intent(&b->lock);
1246                 return -EIO;
1247         }
1248
1249         bch2_btree_set_root_initial(c, b, NULL);
1250         six_unlock_intent(&b->lock);
1251
1252         return 0;
1253 }
1254
1255 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1256                               struct btree_write *w)
1257 {
1258         bch2_journal_pin_drop(&c->journal, &w->journal);
1259         closure_wake_up(&w->wait);
1260 }
1261
1262 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1263 {
1264         struct btree_write *w = btree_prev_write(b);
1265
1266         /*
1267          * Before calling bch2_btree_complete_write() - if the write errored, we
1268          * have to halt new journal writes before they see this btree node
1269          * write as completed:
1270          */
1271         if (btree_node_write_error(b))
1272                 bch2_journal_halt(&c->journal);
1273
1274         bch2_btree_complete_write(c, b, w);
1275         btree_node_io_unlock(b);
1276 }
1277
1278 static void btree_node_write_endio(struct bio *bio)
1279 {
1280         struct btree *b = bio->bi_private;
1281         struct bch_write_bio *wbio = to_wbio(bio);
1282         struct bch_fs *c        = wbio->c;
1283         struct bio *orig        = wbio->split ? wbio->orig : NULL;
1284         struct closure *cl      = !wbio->split ? wbio->cl : NULL;
1285         struct bch_dev *ca      = wbio->ca;
1286
1287         if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
1288             bch2_meta_write_fault("btree"))
1289                 set_btree_node_write_error(b);
1290
1291         if (wbio->bounce)
1292                 btree_bounce_free(c,
1293                         wbio->order,
1294                         wbio->used_mempool,
1295                         page_address(bio->bi_io_vec[0].bv_page));
1296
1297         if (wbio->put_bio)
1298                 bio_put(bio);
1299
1300         if (orig) {
1301                 bio_endio(orig);
1302         } else {
1303                 btree_node_write_done(c, b);
1304                 if (cl)
1305                         closure_put(cl);
1306         }
1307
1308         if (ca)
1309                 percpu_ref_put(&ca->io_ref);
1310 }
1311
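/*
 * Write out the unwritten part of a btree node: claim the write by atomically
 * clearing the dirty bit, compact whiteouts, merge all unwritten bsets (plus
 * the pending whiteouts) into a bounce buffer, checksum/encrypt it, pad it out
 * to a block boundary and submit it to all replicas.
 */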
1312 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1313                             struct closure *parent,
1314                             enum six_lock_type lock_type_held,
1315                             int idx_to_write)
1316 {
1317         struct bio *bio;
1318         struct bch_write_bio *wbio;
1319         struct bset_tree *t;
1320         struct bset *i;
1321         struct btree_node *bn = NULL;
1322         struct btree_node_entry *bne = NULL;
1323         BKEY_PADDED(key) k;
1324         struct bkey_s_extent e;
1325         struct bch_extent_ptr *ptr;
1326         struct sort_iter sort_iter;
1327         struct nonce nonce;
1328         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1329         u64 seq = 0;
1330         bool used_mempool;
1331         unsigned long old, new;
1332         void *data;
1333
1334         /*
1335          * We may only have a read lock on the btree node - the dirty bit is our
1336          * "lock" against racing with other threads that may be trying to start
1337          * a write, we do a write iff we clear the dirty bit. Since setting the
1338          * dirty bit requires a write lock, we can't race with other threads
1339          * redirtying it:
1340          */
1341         do {
1342                 old = new = READ_ONCE(b->flags);
1343
1344                 if (!(old & (1 << BTREE_NODE_dirty)))
1345                         return;
1346
1347                 if (idx_to_write >= 0 &&
1348                     idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
1349                         return;
1350
1351                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1352                         wait_on_bit_io(&b->flags,
1353                                        BTREE_NODE_write_in_flight,
1354                                        TASK_UNINTERRUPTIBLE);
1355                         continue;
1356                 }
1357
1358                 new &= ~(1 << BTREE_NODE_dirty);
1359                 new |=  (1 << BTREE_NODE_write_in_flight);
1360                 new |=  (1 << BTREE_NODE_just_written);
1361                 new ^=  (1 << BTREE_NODE_write_idx);
1362         } while (cmpxchg_acquire(&b->flags, old, new) != old);
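
        /*
         * We now own this write: dirty is cleared, write_in_flight and
         * just_written are set, and write_idx has been flipped so that
         * subsequent updates accumulate against the other btree_write.
         */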
1363
1364         BUG_ON(!list_empty(&b->write_blocked));
1365
1366         BUG_ON(b->written >= c->sb.btree_node_size);
1367         BUG_ON(bset_written(b, btree_bset_last(b)));
1368         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1369         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1370
1371         if (lock_type_held == SIX_LOCK_intent) {
1372                 six_lock_write(&b->lock);
1373                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
1374                 six_unlock_write(&b->lock);
1375         } else {
1376                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1377         }
1378
1379         BUG_ON(b->uncompacted_whiteout_u64s);
1380
1381         sort_iter_init(&sort_iter, b);
1382
1383         bytes = !b->written
1384                 ? sizeof(struct btree_node)
1385                 : sizeof(struct btree_node_entry);
1386
1387         bytes += b->whiteout_u64s * sizeof(u64);
1388
1389         for_each_bset(b, t) {
1390                 i = bset(b, t);
1391
1392                 if (bset_written(b, i))
1393                         continue;
1394
1395                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1396                 sort_iter_add(&sort_iter,
1397                               btree_bkey_first(b, t),
1398                               btree_bkey_last(b, t));
1399                 seq = max(seq, le64_to_cpu(i->journal_seq));
1400         }
1401
1402         order = get_order(bytes);
1403         data = btree_bounce_alloc(c, order, &used_mempool);
1404
1405         if (!b->written) {
1406                 bn = data;
1407                 *bn = *b->data;
1408                 i = &bn->keys;
1409         } else {
1410                 bne = data;
1411                 bne->keys = b->data->keys;
1412                 i = &bne->keys;
1413         }
1414
1415         i->journal_seq  = cpu_to_le64(seq);
1416         i->u64s         = 0;
1417
1418         if (!btree_node_is_extents(b)) {
1419                 sort_iter_add(&sort_iter,
1420                               unwritten_whiteouts_start(c, b),
1421                               unwritten_whiteouts_end(c, b));
1422                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1423         } else {
1424                 memcpy_u64s(i->start,
1425                             unwritten_whiteouts_start(c, b),
1426                             b->whiteout_u64s);
1427                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1428                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1429         }
1430
1431         b->whiteout_u64s = 0;
1432
1433         u64s = btree_node_is_extents(b)
1434                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1435                 : sort_keys(i->start, &sort_iter, false);
1436         le16_add_cpu(&i->u64s, u64s);
1437
1438         clear_needs_whiteout(i);
1439
1440         if (b->written && !i->u64s) {
1441                 /* Nothing to write: */
1442                 btree_bounce_free(c, order, used_mempool, data);
1443                 btree_node_write_done(c, b);
1444                 return;
1445         }
1446
1447         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1448         BUG_ON(i->seq != b->data->keys.seq);
1449
1450         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1451         SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
1452
1453         nonce = btree_nonce(b, i, b->written << 9);
1454
1455         if (bn) {
1456                 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1457                             &bn->flags,
1458                             (void *) &b->data->keys -
1459                             (void *) &b->data->flags);
1460                 nonce = nonce_add(nonce,
1461                                   round_up((void *) &b->data->keys -
1462                                            (void *) &b->data->flags,
1463                                            CHACHA20_BLOCK_SIZE));
1464                 bset_encrypt(c, i, nonce);
1465
1466                 nonce = btree_nonce(b, i, b->written << 9);
1467                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1468         } else {
1469                 bset_encrypt(c, i, nonce);
1470
1471                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1472         }
1473
1474         bytes_to_write = vstruct_end(i) - data;
1475         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1476
1477         memset(data + bytes_to_write, 0,
1478                (sectors_to_write << 9) - bytes_to_write);
1479
1480         BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
1481
1482         trace_btree_write(b, bytes_to_write, sectors_to_write);
1483
1484         /*
1485          * We handle btree write errors by immediately halting the journal -
1486          * after we've done that, we can't issue any subsequent btree writes
1487          * because they might have pointers to new nodes that failed to write.
1488          *
1489          * Furthermore, there's no point in doing any more btree writes because
1490          * with the journal stopped, we're never going to update the journal to
1491          * reflect that those writes were done and the data flushed from the
1492          * journal:
1493          *
1494          * Make sure to update b->written so bch2_btree_init_next() doesn't
1495          * break:
1496          */
1497         if (bch2_journal_error(&c->journal) ||
1498             c->opts.nochanges) {
1499                 set_btree_node_noevict(b);
1500                 b->written += sectors_to_write;
1501
1502                 btree_bounce_free(c, order, used_mempool, data);
1503                 btree_node_write_done(c, b);
1504                 return;
1505         }
1506
1507         bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
1508
1509         wbio                    = to_wbio(bio);
1510         wbio->cl                = parent;
1511         wbio->bounce            = true;
1512         wbio->put_bio           = true;
1513         wbio->order             = order;
1514         wbio->used_mempool      = used_mempool;
1515         bio->bi_iter.bi_size    = sectors_to_write << 9;
1516         bio->bi_end_io          = btree_node_write_endio;
1517         bio->bi_private         = b;
1518         bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
1519
1520         if (parent)
1521                 closure_get(parent);
1522
1523         bch2_bio_map(bio, data);
1524
1525         /*
1526          * If we're appending to a leaf node, we don't technically need FUA -
1527          * this write just needs to be persisted before the next journal write,
1528          * which will be marked FLUSH|FUA.
1529          *
1530          * Similarly if we're writing a new btree root - the pointer is going to
1531          * be in the next journal entry.
1532          *
1533          * But if we're writing a new btree node (that isn't a root) or
1534          * appending to a non-leaf btree node, we need either FUA or a flush
1535          * when we write the parent with the new pointer. FUA is cheaper than a
1536          * flush, and writes appending to leaf nodes aren't blocking anything,
1537          * so just make all btree node writes FUA to keep things sane.
1538          */
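        /*
         * A minimal sketch of the conditional scheme described above, assuming
         * hypothetical helpers appending_to_leaf() and writing_new_root() -
         * illustration only, the write above just sets REQ_FUA unconditionally:
         *
         *      unsigned op_flags = REQ_META|WRITE_SYNC;
         *
         *      if (!appending_to_leaf(b) && !writing_new_root(c, b))
         *              op_flags |= REQ_FUA;
         *
         *      bio_set_op_attrs(bio, REQ_OP_WRITE, op_flags);
         */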
1539
1540         bkey_copy(&k.key, &b->key);
1541         e = bkey_i_to_s_extent(&k.key);
1542
1543         extent_for_each_ptr(e, ptr)
1544                 ptr->offset += b->written;
1545
1546         extent_for_each_ptr(e, ptr)
1547                 atomic64_add(sectors_to_write,
1548                              &c->devs[ptr->dev]->btree_sectors_written);
1549
1550         b->written += sectors_to_write;
1551
1552         bch2_submit_wbio_replicas(wbio, c, &k.key);
1553 }
1554
1555 /*
1556  * Work that must be done with the write lock held:
1557  */
1558 bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
1559 {
1560         bool invalidated_iter = false;
1561         struct btree_node_entry *bne;
1562         struct bset_tree *t;
1563
1564         if (!btree_node_just_written(b))
1565                 return false;
1566
1567         BUG_ON(b->whiteout_u64s);
1568         BUG_ON(b->uncompacted_whiteout_u64s);
1569
1570         clear_btree_node_just_written(b);
1571
1572         /*
1573          * Note: immediately after write, bset_unwritten()/bset_written() don't
1574          * work - the amount of data we had to write after compaction might have
1575          * been smaller than the offset of the last bset.
1576          *
1577          * However, we know that all bsets have been written here, as long as
1578          * we're still holding the write lock:
1579          */
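        /*
         * e.g. (illustrative numbers): the last in-memory bset might start at
         * an offset corresponding to sector 16 of the node while the compacted
         * data we just wrote only advanced b->written to 10, so comparing a
         * bset's offset against b->written no longer tells us what's on disk.
         */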
1580
1581         /*
1582          * XXX: decide if we really want to unconditionally sort down to a
1583          * single bset:
1584          */
1585         if (b->nsets > 1) {
1586                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1587                 invalidated_iter = true;
1588         } else {
1589                 invalidated_iter = bch2_drop_whiteouts(b);
1590         }
1591
1592         for_each_bset(b, t)
1593                 set_needs_whiteout(bset(b, t));
1594
1595         bch2_btree_verify(c, b);
1596
1597         /*
1598          * If later we don't unconditionally sort down to a single bset, we have
1599          * to ensure this is still true:
1600          */
1601         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1602
1603         bne = want_new_bset(c, b);
1604         if (bne)
1605                 bch2_bset_init_next(b, &bne->keys);
1606
1607         bch2_btree_build_aux_trees(b);
1608
1609         return invalidated_iter;
1610 }
1611
1612 /*
1613  * Use this one if the node is intent locked:
1614  */
1615 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1616                           struct closure *parent,
1617                           enum six_lock_type lock_type_held,
1618                           int idx_to_write)
1619 {
1620         BUG_ON(lock_type_held == SIX_LOCK_write);
1621
1622         if (lock_type_held == SIX_LOCK_intent ||
1623             six_trylock_convert(&b->lock, SIX_LOCK_read,
1624                                 SIX_LOCK_intent)) {
1625                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
1626
1627                 six_lock_write(&b->lock);
1628                 bch2_btree_post_write_cleanup(c, b);
1629                 six_unlock_write(&b->lock);
1630
1631                 if (lock_type_held == SIX_LOCK_read)
1632                         six_lock_downgrade(&b->lock);
1633         } else {
1634                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
1635         }
1636 }
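
/*
 * Usage sketch (not from this file), for a node the caller already has intent
 * locked - normally via a btree iterator, shown here with a bare six lock for
 * brevity:
 *
 *      six_lock_intent(&b->lock);
 *      bch2_btree_node_write(c, b, NULL, SIX_LOCK_intent, -1);
 *      six_unlock_intent(&b->lock);
 */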
1637
1638 static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b,
1639                                        struct closure *parent)
1640 {
1641         six_lock_read(&b->lock);
1642         BUG_ON(b->level);
1643
1644         bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
1645         six_unlock_read(&b->lock);
1646 }
1647
1648 /*
1649  * Write all dirty btree nodes to disk, including roots
1650  */
1651 void bch2_btree_flush(struct bch_fs *c)
1652 {
1653         struct closure cl;
1654         struct btree *b;
1655         struct bucket_table *tbl;
1656         struct rhash_head *pos;
1657         bool dropped_lock;
1658         unsigned i;
1659
1660         closure_init_stack(&cl);
1661
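        /*
         * We can't issue writes while inside an RCU read-side critical section,
         * so each time we find a dirty leaf we drop the RCU lock, write the
         * node, then retake the lock and restart the scan from the current
         * bucket - the hash table may have been resized in the meantime.
         */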
1662         rcu_read_lock();
1663
1664         do {
1665                 dropped_lock = false;
1666                 i = 0;
1667 restart:
1668                 tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
1669                                           &c->btree_cache_table);
1670
1671                 for (; i < tbl->size; i++)
1672                         rht_for_each_entry_rcu(b, pos, tbl, i, hash)
1673                                 /*
1674                                  * XXX - locking for b->level, when called from
1675                                  * bch2_journal_move()
1676                                  */
1677                                 if (!b->level && btree_node_dirty(b)) {
1678                                         rcu_read_unlock();
1679                                         bch2_btree_node_write_dirty(c, b, &cl);
1680                                         dropped_lock = true;
1681                                         rcu_read_lock();
1682                                         goto restart;
1683                                 }
1684         } while (dropped_lock);
1685
1686         rcu_read_unlock();
1687
1688         closure_sync(&cl);
1689 }
1690
1691 /**
1692  * bch2_btree_node_flush_journal_entries - flush any journal entries that
1693  * contain keys from this node
1694  *
1695  * Each bset's journal sequence number is used to preserve the ordering of
1696  * index updates across unclean shutdowns - bsets newer than the most recent
1697  * journal entry are ignored.
1698  *
1699  * But when rewriting btree nodes we compact all the bsets in a node - and if
1700  * a bset that should have been ignored got compacted together with bsets we
1701  * do need, those updates could no longer be discarded. To avoid that, before
1702  * making the new node visible, ensure the journal has been flushed so that
1703  * every bset we compacted is one that should be visible.
1704  */
1705 void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
1706                                           struct btree *b,
1707                                           struct closure *cl)
1708 {
1709         int i = b->nsets;
1710
1711         /*
1712          * Journal sequence numbers in the different bsets will always be in
1713          * ascending order, so we only need to flush the highest - except that
1714          * the most recent bset might not have a journal sequence number yet,
1715          * so we need to loop:
1716          */
1717         while (i--) {
1718                 u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
1719
1720                 if (seq) {
1721                         bch2_journal_flush_seq_async(&c->journal, seq, cl);
1722                         break;
1723                 }
1724         }
1725 }
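
/*
 * Example of the ordering described in the comment above (sketch;
 * make_new_node_visible() is a hypothetical stand-in for the real interior
 * update path):
 *
 *      struct closure cl;
 *
 *      closure_init_stack(&cl);
 *      bch2_btree_node_flush_journal_entries(c, b, &cl);
 *      closure_sync(&cl);
 *      make_new_node_visible(c, new_node);
 */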