libbcache/btree_io.c (bcachefs-tools-debian, commit e772c6adf389f5b4e6b61ddf4cde250051c24b56)
1
2 #include "bcache.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_update.h"
6 #include "btree_io.h"
7 #include "btree_iter.h"
8 #include "btree_locking.h"
9 #include "buckets.h"
10 #include "checksum.h"
11 #include "debug.h"
12 #include "error.h"
13 #include "extents.h"
14 #include "io.h"
15 #include "journal.h"
16 #include "super-io.h"
17
18 #include <trace/events/bcache.h>
19
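/*
 * Debug-only sanity check: walk [start, end) and verify adjacent keys are in
 * order - strictly increasing for non-extent nodes; for extent nodes, a key
 * may not overlap the start of the one that follows it. Compiled out unless
 * CONFIG_BCACHE_DEBUG is set.
 */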
20 static void verify_no_dups(struct btree *b,
21                            struct bkey_packed *start,
22                            struct bkey_packed *end)
23 {
24 #ifdef CONFIG_BCACHE_DEBUG
25         struct bkey_packed *k;
26
27         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
28                 struct bkey l = bkey_unpack_key(b, k);
29                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
30
31                 BUG_ON(btree_node_is_extents(b)
32                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
33                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
34                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
35         }
36 #endif
37 }
38
39 static void clear_needs_whiteout(struct bset *i)
40 {
41         struct bkey_packed *k;
42
43         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
44                 k->needs_whiteout = false;
45 }
46
47 static void set_needs_whiteout(struct bset *i)
48 {
49         struct bkey_packed *k;
50
51         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
52                 k->needs_whiteout = true;
53 }
54
55 static void btree_bounce_free(struct cache_set *c, unsigned order,
56                               bool used_mempool, void *p)
57 {
58         if (used_mempool)
59                 mempool_free(virt_to_page(p), &c->btree_bounce_pool);
60         else
61                 free_pages((unsigned long) p, order);
62 }
63
64 static void *btree_bounce_alloc(struct cache_set *c, unsigned order,
65                                 bool *used_mempool)
66 {
67         void *p;
68
69         BUG_ON(1 << order > btree_pages(c));
70
71         *used_mempool = false;
72         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
73         if (p)
74                 return p;
75
76         *used_mempool = true;
77         return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
78 }
79
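/*
 * sort_iter: a small N-way merge over up to MAX_BSETS + 1 already-sorted
 * ranges of packed keys. The ranges are kept ordered by their current head
 * key (insertion-sort style sifting), so sort_iter_next() always hands back
 * the smallest remaining key across all ranges.
 */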
80 typedef int (*sort_cmp_fn)(struct btree *,
81                            struct bkey_packed *,
82                            struct bkey_packed *);
83
84 struct sort_iter {
85         struct btree    *b;
86         unsigned                used;
87
88         struct sort_iter_set {
89                 struct bkey_packed *k, *end;
90         } data[MAX_BSETS + 1];
91 };
92
93 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
94 {
95         memset(iter, 0, sizeof(*iter));
96         iter->b = b;
97 }
98
99 static inline void __sort_iter_sift(struct sort_iter *iter,
100                                     unsigned from,
101                                     sort_cmp_fn cmp)
102 {
103         unsigned i;
104
105         for (i = from;
106              i + 1 < iter->used &&
107              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
108              i++)
109                 swap(iter->data[i], iter->data[i + 1]);
110 }
111
112 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
113 {
114
115         __sort_iter_sift(iter, 0, cmp);
116 }
117
118 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
119 {
120         unsigned i = iter->used;
121
122         while (i--)
123                 __sort_iter_sift(iter, i, cmp);
124 }
125
126 static void sort_iter_add(struct sort_iter *iter,
127                           struct bkey_packed *k,
128                           struct bkey_packed *end)
129 {
130         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
131
132         if (k != end)
133                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
134 }
135
136 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
137 {
138         return iter->used ? iter->data->k : NULL;
139 }
140
141 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
142 {
143         iter->data->k = bkey_next(iter->data->k);
144
145         BUG_ON(iter->data->k > iter->data->end);
146
147         if (iter->data->k == iter->data->end)
148                 memmove(&iter->data[0],
149                         &iter->data[1],
150                         sizeof(iter->data[0]) * --iter->used);
151         else
152                 sort_iter_sift(iter, cmp);
153 }
154
155 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
156                                                  sort_cmp_fn cmp)
157 {
158         struct bkey_packed *ret = sort_iter_peek(iter);
159
160         if (ret)
161                 sort_iter_advance(iter, cmp);
162
163         return ret;
164 }
165
166 static inline int sort_key_whiteouts_cmp(struct btree *b,
167                                          struct bkey_packed *l,
168                                          struct bkey_packed *r)
169 {
170         return bkey_cmp_packed(b, l, r);
171 }
172
173 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
174                                    struct sort_iter *iter)
175 {
176         struct bkey_packed *in, *out = dst;
177
178         sort_iter_sort(iter, sort_key_whiteouts_cmp);
179
180         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
181                 bkey_copy(out, in);
182                 out = bkey_next(out);
183         }
184
185         return (u64 *) out - (u64 *) dst;
186 }
187
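/*
 * Extent whiteouts are zero-value KEY_TYPE_DISCARD keys covering a range, so
 * besides sorting them by start position we also coalesce them: the previous
 * whiteout is extended as far as its packed format allows, fully covered
 * whiteouts are dropped, and the front of a partially covered one is cut off.
 */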
188 static inline int sort_extent_whiteouts_cmp(struct btree *b,
189                                             struct bkey_packed *l,
190                                             struct bkey_packed *r)
191 {
192         struct bkey ul = bkey_unpack_key(b, l);
193         struct bkey ur = bkey_unpack_key(b, r);
194
195         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
196 }
197
198 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
199                                       struct sort_iter *iter)
200 {
201         const struct bkey_format *f = &iter->b->format;
202         struct bkey_packed *in, *out = dst;
203         struct bkey_i l, r;
204         bool prev = false, l_packed = false;
205         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
206         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
207         u64 new_size;
208
209         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
210
211         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
212
213         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
214                 EBUG_ON(bkeyp_val_u64s(f, in));
215                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
216
217                 r.k = bkey_unpack_key(iter->b, in);
218
219                 if (prev &&
220                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
221                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
222                                 continue;
223
224                         new_size = l_packed
225                                 ? min(max_packed_size, max_packed_offset -
226                                       bkey_start_offset(&l.k))
227                                 : KEY_SIZE_MAX;
228
229                         new_size = min(new_size, r.k.p.offset -
230                                        bkey_start_offset(&l.k));
231
232                         BUG_ON(new_size < l.k.size);
233
234                         bch_key_resize(&l.k, new_size);
235
236                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
237                                 continue;
238
239                         bch_cut_front(l.k.p, &r);
240                 }
241
242                 if (prev) {
243                         if (!bkey_pack(out, &l, f)) {
244                                 BUG_ON(l_packed);
245                                 bkey_copy(out, &l);
246                         }
247                         out = bkey_next(out);
248                 }
249
250                 l = r;
251                 prev = true;
252                 l_packed = bkey_packed(in);
253         }
254
255         if (prev) {
256                 if (!bkey_pack(out, &l, f)) {
257                         BUG_ON(l_packed);
258                         bkey_copy(out, &l);
259                 }
260                 out = bkey_next(out);
261         }
262
263         return (u64 *) out - (u64 *) dst;
264 }
265
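/*
 * Returns how many u64s compacting this bset would free, or 0 if it isn't
 * worth compacting: in COMPACT_LAZY mode only bsets less than 75% live (or
 * unwritten bsets, once we're compacting anyway) qualify; in the other modes
 * only bsets that have already been written do.
 */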
266 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
267                                     bool compacting,
268                                     enum compact_mode mode)
269 {
270         unsigned live_u64s = b->nr.bset_u64s[t - b->set];
271         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
272
273         if (live_u64s == bset_u64s)
274                 return 0;
275
276         if (mode == COMPACT_LAZY) {
277                 if (live_u64s * 4 < bset_u64s * 3 ||
278                     (compacting && bset_unwritten(b, bset(b, t))))
279                         return bset_u64s - live_u64s;
280         } else {
281                 if (bset_written(b, bset(b, t)))
282                         return bset_u64s - live_u64s;
283         }
284
285         return 0;
286 }
287
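/*
 * Compact a node's whiteouts: whiteouts that still need to be written out are
 * gathered into a bounce buffer, sorted (and coalesced, for extents), and
 * stashed in the node's unwritten-whiteouts area for the next write. In the
 * modes that hold a write lock, each compacted bset is also rewritten with
 * only its live keys. Returns true if any compaction was done.
 */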
288 bool __bch_compact_whiteouts(struct cache_set *c, struct btree *b,
289                              enum compact_mode mode)
290 {
291         const struct bkey_format *f = &b->format;
292         struct bset_tree *t;
293         struct bkey_packed *whiteouts = NULL;
294         struct bkey_packed *u_start, *u_pos;
295         struct sort_iter sort_iter;
296         unsigned order, whiteout_u64s = 0, u64s;
297         bool used_mempool, compacting = false;
298
299         for_each_bset(b, t)
300                 whiteout_u64s += should_compact_bset(b, t,
301                                         whiteout_u64s != 0, mode);
302
303         if (!whiteout_u64s)
304                 return false;
305
306         sort_iter_init(&sort_iter, b);
307
308         whiteout_u64s += b->whiteout_u64s;
309         order = get_order(whiteout_u64s * sizeof(u64));
310
311         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
312         u_start = u_pos = whiteouts;
313
314         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
315                     b->whiteout_u64s);
316         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
317
318         sort_iter_add(&sort_iter, u_start, u_pos);
319
320         for_each_bset(b, t) {
321                 struct bset *i = bset(b, t);
322                 struct bkey_packed *k, *n, *out, *start, *end;
323                 struct btree_node_entry *src = NULL, *dst = NULL;
324
325                 if (t != b->set && bset_unwritten(b, i)) {
326                         src = container_of(i, struct btree_node_entry, keys);
327                         dst = max(write_block(b),
328                                   (void *) btree_bkey_last(b, t - 1));
329                 }
330
331                 if (!should_compact_bset(b, t, compacting, mode)) {
332                         if (src != dst) {
333                                 memmove(dst, src, sizeof(*src) +
334                                         le16_to_cpu(src->keys.u64s) *
335                                         sizeof(u64));
336                                 i = &dst->keys;
337                                 set_btree_bset(b, t, i);
338                         }
339                         continue;
340                 }
341
342                 compacting = true;
343                 u_start = u_pos;
344                 start = i->start;
345                 end = vstruct_last(i);
346
347                 if (src != dst) {
348                         memmove(dst, src, sizeof(*src));
349                         i = &dst->keys;
350                         set_btree_bset(b, t, i);
351                 }
352
353                 out = i->start;
354
355                 for (k = start; k != end; k = n) {
356                         n = bkey_next(k);
357
358                         if (bkey_deleted(k) && btree_node_is_extents(b))
359                                 continue;
360
361                         if (bkey_whiteout(k) && !k->needs_whiteout)
362                                 continue;
363
364                         if (bkey_whiteout(k)) {
365                                 unreserve_whiteout(b, t, k);
366                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
367                                 set_bkeyp_val_u64s(f, u_pos, 0);
368                                 u_pos = bkey_next(u_pos);
369                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
370                                 bkey_copy(out, k);
371                                 out = bkey_next(out);
372                         }
373                 }
374
375                 sort_iter_add(&sort_iter, u_start, u_pos);
376
377                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
378                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
379                         set_btree_bset_end(b, t);
380                         bch_bset_set_no_aux_tree(b, t);
381                 }
382         }
383
384         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
385
386         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
387                (void *) btree_bkey_last(b, bset_tree_last(b)));
388
389         u64s = btree_node_is_extents(b)
390                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
391                                         &sort_iter)
392                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
393                                      &sort_iter);
394
395         BUG_ON(u64s > b->whiteout_u64s);
396         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
397         BUG_ON(u_pos != whiteouts && !u64s);
398
399         if (u64s != b->whiteout_u64s) {
400                 void *src = unwritten_whiteouts_start(c, b);
401
402                 b->whiteout_u64s = u64s;
403                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
404         }
405
406         verify_no_dups(b,
407                        unwritten_whiteouts_start(c, b),
408                        unwritten_whiteouts_end(c, b));
409
410         btree_bounce_free(c, order, used_mempool, whiteouts);
411
412         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
413                 bch_btree_build_aux_trees(b);
414
415         bch_btree_keys_u64s_remaining(c, b);
416         bch_verify_btree_nr_keys(b);
417
418         return true;
419 }
420
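/*
 * Simpler variant for when whiteouts no longer need to be preserved: rewrite
 * each compactable bset with only its non-whiteout keys. Returns true if any
 * bset was rewritten.
 */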
421 static bool bch_drop_whiteouts(struct btree *b)
422 {
423         struct bset_tree *t;
424         bool ret = false;
425
426         for_each_bset(b, t) {
427                 struct bset *i = bset(b, t);
428                 struct bkey_packed *k, *n, *out, *start, *end;
429
430                 if (!should_compact_bset(b, t, true, true))
431                         continue;
432
433                 start   = btree_bkey_first(b, t);
434                 end     = btree_bkey_last(b, t);
435
436                 if (bset_unwritten(b, i) &&
437                     t != b->set) {
438                         struct bset *dst =
439                                max_t(struct bset *, write_block(b),
440                                      (void *) btree_bkey_last(b, t - 1));
441
442                         memmove(dst, i, sizeof(struct bset));
443                         i = dst;
444                         set_btree_bset(b, t, i);
445                 }
446
447                 out = i->start;
448
449                 for (k = start; k != end; k = n) {
450                         n = bkey_next(k);
451
452                         if (!bkey_whiteout(k)) {
453                                 bkey_copy(out, k);
454                                 out = bkey_next(out);
455                         }
456                 }
457
458                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
459                 bch_bset_set_no_aux_tree(b, t);
460                 ret = true;
461         }
462
463         bch_verify_btree_nr_keys(b);
464
465         return ret;
466 }
467
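/*
 * Sort order for the non-extent write path: by position, with whiteouts
 * ordered ahead of live keys at the same position so that sort_keys() can
 * transfer needs_whiteout to the key that survives and drop the duplicate.
 */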
468 static inline int sort_keys_cmp(struct btree *b,
469                                 struct bkey_packed *l,
470                                 struct bkey_packed *r)
471 {
472         return bkey_cmp_packed(b, l, r) ?:
473                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
474                 (int) l->needs_whiteout - (int) r->needs_whiteout;
475 }
476
477 static unsigned sort_keys(struct bkey_packed *dst,
478                           struct sort_iter *iter,
479                           bool filter_whiteouts)
480 {
481         const struct bkey_format *f = &iter->b->format;
482         struct bkey_packed *in, *next, *out = dst;
483
484         sort_iter_sort(iter, sort_keys_cmp);
485
486         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
487                 if (bkey_whiteout(in) &&
488                     (filter_whiteouts || !in->needs_whiteout))
489                         continue;
490
491                 if (bkey_whiteout(in) &&
492                     (next = sort_iter_peek(iter)) &&
493                     !bkey_cmp_packed(iter->b, in, next)) {
494                         BUG_ON(in->needs_whiteout &&
495                                next->needs_whiteout);
496                         /*
497                          * XXX racy, called with read lock from write path
498                          *
499                          * leads to spurious BUG_ON() in bkey_unpack_key() in
500                          * debug mode
501                          */
502                         next->needs_whiteout |= in->needs_whiteout;
503                         continue;
504                 }
505
506                 if (bkey_whiteout(in)) {
507                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
508                         set_bkeyp_val_u64s(f, out, 0);
509                 } else {
510                         bkey_copy(out, in);
511                 }
512                 out = bkey_next(out);
513         }
514
515         return (u64 *) out - (u64 *) dst;
516 }
517
518 static inline int sort_extents_cmp(struct btree *b,
519                                    struct bkey_packed *l,
520                                    struct bkey_packed *r)
521 {
522         return bkey_cmp_packed(b, l, r) ?:
523                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
524 }
525
526 static unsigned sort_extents(struct bkey_packed *dst,
527                              struct sort_iter *iter,
528                              bool filter_whiteouts)
529 {
530         struct bkey_packed *in, *out = dst;
531
532         sort_iter_sort(iter, sort_extents_cmp);
533
534         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
535                 if (bkey_deleted(in))
536                         continue;
537
538                 if (bkey_whiteout(in) &&
539                     (filter_whiteouts || !in->needs_whiteout))
540                         continue;
541
542                 bkey_copy(out, in);
543                 out = bkey_next(out);
544         }
545
546         return (u64 *) out - (u64 *) dst;
547 }
548
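/*
 * Merge bsets [start_idx, end_idx) of a node into a single sorted bset, via a
 * bounce buffer sized for the result (or for the whole node when sorting
 * everything, in which case the buffers are simply swapped). The merged keys
 * end up in set[start_idx]; later bset_tree entries are shifted down and the
 * merged bset's aux search tree is invalidated.
 */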
549 static void btree_node_sort(struct cache_set *c, struct btree *b,
550                             struct btree_iter *iter,
551                             unsigned start_idx,
552                             unsigned end_idx,
553                             bool filter_whiteouts)
554 {
555         struct btree_node *out;
556         struct sort_iter sort_iter;
557         struct bset_tree *t;
558         struct bset *start_bset = bset(b, &b->set[start_idx]);
559         bool used_mempool = false;
560         u64 start_time;
561         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
562         bool sorting_entire_node = start_idx == 0 &&
563                 end_idx == b->nsets;
564
565         sort_iter_init(&sort_iter, b);
566
567         for (t = b->set + start_idx;
568              t < b->set + end_idx;
569              t++) {
570                 u64s += le16_to_cpu(bset(b, t)->u64s);
571                 sort_iter_add(&sort_iter,
572                               btree_bkey_first(b, t),
573                               btree_bkey_last(b, t));
574         }
575
576         order = sorting_entire_node
577                 ? btree_page_order(c)
578                 : get_order(__vstruct_bytes(struct btree_node, u64s));
579
580         out = btree_bounce_alloc(c, order, &used_mempool);
581
582         start_time = local_clock();
583
584         if (btree_node_is_extents(b))
585                 filter_whiteouts = bset_written(b, start_bset);
586
587         u64s = btree_node_is_extents(b)
588                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
589                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
590
591         out->keys.u64s = cpu_to_le16(u64s);
592
593         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
594
595         if (sorting_entire_node)
596                 bch_time_stats_update(&c->btree_sort_time, start_time);
597
598         /* Make sure we preserve bset journal_seq: */
599         for (t = b->set + start_idx + 1;
600              t < b->set + end_idx;
601              t++)
602                 start_bset->journal_seq =
603                         max(start_bset->journal_seq,
604                             bset(b, t)->journal_seq);
605
606         if (sorting_entire_node) {
607                 unsigned u64s = le16_to_cpu(out->keys.u64s);
608
609                 BUG_ON(order != btree_page_order(c));
610
611                 /*
612                  * Our temporary buffer is the same size as the btree node's
613                  * buffer, so we can just swap buffers instead of doing a big
614                  * memcpy()
615                  */
616                 *out = *b->data;
617                 out->keys.u64s = cpu_to_le16(u64s);
618                 swap(out, b->data);
619                 set_btree_bset(b, b->set, &b->data->keys);
620         } else {
621                 start_bset->u64s = out->keys.u64s;
622                 memcpy_u64s(start_bset->start,
623                             out->keys.start,
624                             le16_to_cpu(out->keys.u64s));
625         }
626
627         for (i = start_idx + 1; i < end_idx; i++)
628                 b->nr.bset_u64s[start_idx] +=
629                         b->nr.bset_u64s[i];
630
631         b->nsets -= shift;
632
633         for (i = start_idx + 1; i < b->nsets; i++) {
634                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
635                 b->set[i]               = b->set[i + shift];
636         }
637
638         for (i = b->nsets; i < MAX_BSETS; i++)
639                 b->nr.bset_u64s[i] = 0;
640
641         set_btree_bset_end(b, &b->set[start_idx]);
642         bch_bset_set_no_aux_tree(b, &b->set[start_idx]);
643
644         btree_bounce_free(c, order, used_mempool, out);
645
646         bch_verify_btree_nr_keys(b);
647 }
648
649 /* Sort + repack in a new format: */
650 static struct btree_nr_keys sort_repack(struct bset *dst,
651                                         struct btree *src,
652                                         struct btree_node_iter *src_iter,
653                                         struct bkey_format *out_f,
654                                         bool filter_whiteouts)
655 {
656         struct bkey_format *in_f = &src->format;
657         struct bkey_packed *in, *out = vstruct_last(dst);
658         struct btree_nr_keys nr;
659
660         memset(&nr, 0, sizeof(nr));
661
662         while ((in = bch_btree_node_iter_next_all(src_iter, src))) {
663                 if (filter_whiteouts && bkey_whiteout(in))
664                         continue;
665
666                 if (bch_bkey_transform(out_f, out, bkey_packed(in)
667                                        ? in_f : &bch_bkey_format_current, in))
668                         out->format = KEY_FORMAT_LOCAL_BTREE;
669                 else
670                         bkey_unpack(src, (void *) out, in);
671
672                 btree_keys_account_key_add(&nr, 0, out);
673                 out = bkey_next(out);
674         }
675
676         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
677         return nr;
678 }
679
680 /* Sort, repack, and merge: */
681 static struct btree_nr_keys sort_repack_merge(struct cache_set *c,
682                                               struct bset *dst,
683                                               struct btree *src,
684                                               struct btree_node_iter *iter,
685                                               struct bkey_format *out_f,
686                                               bool filter_whiteouts,
687                                               key_filter_fn filter,
688                                               key_merge_fn merge)
689 {
690         struct bkey_packed *k, *prev = NULL, *out;
691         struct btree_nr_keys nr;
692         BKEY_PADDED(k) tmp;
693
694         memset(&nr, 0, sizeof(nr));
695
696         while ((k = bch_btree_node_iter_next_all(iter, src))) {
697                 if (filter_whiteouts && bkey_whiteout(k))
698                         continue;
699
700                 /*
701                  * The filter might modify pointers, so we have to unpack the
702                  * key and values to &tmp.k:
703                  */
704                 bkey_unpack(src, &tmp.k, k);
705
706                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
707                         continue;
708
709                 /* prev is always unpacked, for key merging: */
710
711                 if (prev &&
712                     merge &&
713                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
714                         continue;
715
716                 /*
717                  * the current key becomes the new prev: advance prev, then
718                  * copy the current key - but first pack prev (in place):
719                  */
720                 if (prev) {
721                         bkey_pack(prev, (void *) prev, out_f);
722
723                         btree_keys_account_key_add(&nr, 0, prev);
724                         prev = bkey_next(prev);
725                 } else {
726                         prev = vstruct_last(dst);
727                 }
728
729                 bkey_copy(prev, &tmp.k);
730         }
731
732         if (prev) {
733                 bkey_pack(prev, (void *) prev, out_f);
734                 btree_keys_account_key_add(&nr, 0, prev);
735                 out = bkey_next(prev);
736         } else {
737                 out = vstruct_last(dst);
738         }
739
740         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
741         return nr;
742 }
743
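/*
 * Sort all of @src's keys into @dst's first bset, repacking them into @dst's
 * key format and, when the btree type supports it, normalizing and merging
 * keys along the way.
 */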
744 void bch_btree_sort_into(struct cache_set *c,
745                          struct btree *dst,
746                          struct btree *src)
747 {
748         struct btree_nr_keys nr;
749         struct btree_node_iter src_iter;
750         u64 start_time = local_clock();
751
752         BUG_ON(dst->nsets != 1);
753
754         bch_bset_set_no_aux_tree(dst, dst->set);
755
756         bch_btree_node_iter_init_from_start(&src_iter, src,
757                                             btree_node_is_extents(src));
758
759         if (btree_node_ops(src)->key_normalize ||
760             btree_node_ops(src)->key_merge)
761                 nr = sort_repack_merge(c, btree_bset_first(dst),
762                                 src, &src_iter,
763                                 &dst->format,
764                                 true,
765                                 btree_node_ops(src)->key_normalize,
766                                 btree_node_ops(src)->key_merge);
767         else
768                 nr = sort_repack(btree_bset_first(dst),
769                                 src, &src_iter,
770                                 &dst->format,
771                                 true);
772
773         bch_time_stats_update(&c->btree_sort_time, start_time);
774
775         set_btree_bset_end(dst, dst->set);
776
777         dst->nr.live_u64s       += nr.live_u64s;
778         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
779         dst->nr.packed_keys     += nr.packed_keys;
780         dst->nr.unpacked_keys   += nr.unpacked_keys;
781
782         bch_verify_btree_nr_keys(dst);
783 }
784
785 #define SORT_CRIT       (4096 / sizeof(u64))
786
787 /*
788  * We're about to add another bset to the btree node, so if there's currently
789  * too many bsets - sort some of them together:
790  */
791 static bool btree_node_compact(struct cache_set *c, struct btree *b,
792                                struct btree_iter *iter)
793 {
794         unsigned unwritten_idx;
795         bool ret = false;
796
797         for (unwritten_idx = 0;
798              unwritten_idx < b->nsets;
799              unwritten_idx++)
800                 if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
801                         break;
802
803         if (b->nsets - unwritten_idx > 1) {
804                 btree_node_sort(c, b, iter, unwritten_idx,
805                                 b->nsets, false);
806                 ret = true;
807         }
808
809         if (unwritten_idx > 1) {
810                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
811                 ret = true;
812         }
813
814         return ret;
815 }
816
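/*
 * Rebuild the aux search trees for every bset; the third argument is true
 * only for the last, still-unwritten bset (presumably selecting the
 * writeable/insertable flavour of aux tree).
 */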
817 void bch_btree_build_aux_trees(struct btree *b)
818 {
819         struct bset_tree *t;
820
821         for_each_bset(b, t)
822                 bch_bset_build_aux_tree(b, t,
823                                 bset_unwritten(b, bset(b, t)) &&
824                                 t == bset_tree_last(b));
825 }
826
827 /*
828  * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
829  * inserted into
830  *
831  * Safe to call if there already is an unwritten bset - will only add a new bset
832  * if @b doesn't already have one.
833  *
834  * If we sorted (i.e. invalidated iterators), any iterator passed in is reinitialized.
835  */
836 void bch_btree_init_next(struct cache_set *c, struct btree *b,
837                          struct btree_iter *iter)
838 {
839         struct btree_node_entry *bne;
840         bool did_sort;
841
842         EBUG_ON(!(b->lock.state.seq & 1));
843         EBUG_ON(iter && iter->nodes[b->level] != b);
844
845         did_sort = btree_node_compact(c, b, iter);
846
847         bne = want_new_bset(c, b);
848         if (bne)
849                 bch_bset_init_next(b, &bne->keys);
850
851         bch_btree_build_aux_trees(b);
852
853         if (iter && did_sort)
854                 bch_btree_iter_reinit_node(iter, b);
855 }
856
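/*
 * Nonce used for checksumming/encrypting a bset: built from the bset's byte
 * offset within the node plus its seq and journal_seq fields, with the
 * btree-specific nonce type XORed in.
 */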
857 static struct nonce btree_nonce(struct btree *b,
858                                 struct bset *i,
859                                 unsigned offset)
860 {
861         return (struct nonce) {{
862                 [0] = cpu_to_le32(offset),
863                 [1] = ((__le32 *) &i->seq)[0],
864                 [2] = ((__le32 *) &i->seq)[1],
865                 [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
866         }};
867 }
868
869 static void bset_encrypt(struct cache_set *c, struct bset *i, struct nonce nonce)
870 {
871         bch_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
872                     vstruct_end(i) - (void *) i->_data);
873 }
874
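/* Note: besides its parameters, this macro also uses @ca and @i from the calling scope. */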
875 #define btree_node_error(b, c, ptr, fmt, ...)                           \
876         cache_set_inconsistent(c,                                       \
877                 "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
878                 (b)->btree_id, (b)->level, btree_node_root(c, b)        \
879                             ? btree_node_root(c, b)->level : -1,        \
880                 PTR_BUCKET_NR(ca, ptr), (b)->written,                   \
881                 le16_to_cpu((i)->u64s), ##__VA_ARGS__)
882
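/*
 * Validate one bset read from disk: check the version and size, then walk the
 * keys, truncating or dropping anything malformed (zero-length keys, keys
 * running past the end of the bset, bad formats, invalid values) and flagging
 * out-of-order keys. *whiteout_u64s is set to the size of the leading
 * separate-whiteouts region, if there is one. Returns an error string for
 * unrecoverable problems, NULL otherwise.
 */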
883 static const char *validate_bset(struct cache_set *c, struct btree *b,
884                                  struct cache *ca,
885                                  const struct bch_extent_ptr *ptr,
886                                  struct bset *i, unsigned sectors,
887                                  unsigned *whiteout_u64s)
888 {
889         struct bkey_packed *k, *prev = NULL;
890         struct bpos prev_pos = POS_MIN;
891         bool seen_non_whiteout = false;
892
893         if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
894                 return "unsupported bset version";
895
896         if (b->written + sectors > c->sb.btree_node_size)
897                 return "bset past end of btree node";
898
899         if (i != &b->data->keys && !i->u64s)
900                 btree_node_error(b, c, ptr, "empty set");
901
902         if (!BSET_SEPARATE_WHITEOUTS(i)) {
903                 seen_non_whiteout = true;
904                 *whiteout_u64s = 0;
905         }
906
907         for (k = i->start;
908              k != vstruct_last(i);) {
909                 struct bkey_s_c u;
910                 struct bkey tmp;
911                 const char *invalid;
912
913                 if (!k->u64s) {
914                         btree_node_error(b, c, ptr,
915                                 "KEY_U64s 0: %zu bytes of metadata lost",
916                                 vstruct_end(i) - (void *) k);
917
918                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
919                         break;
920                 }
921
922                 if (bkey_next(k) > vstruct_last(i)) {
923                         btree_node_error(b, c, ptr,
924                                          "key extends past end of bset");
925
926                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
927                         break;
928                 }
929
930                 if (k->format > KEY_FORMAT_CURRENT) {
931                         btree_node_error(b, c, ptr,
932                                          "invalid bkey format %u", k->format);
933
934                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
935                         memmove_u64s_down(k, bkey_next(k),
936                                           (u64 *) vstruct_end(i) - (u64 *) k);
937                         continue;
938                 }
939
940                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
941                         bch_bkey_swab(btree_node_type(b), &b->format, k);
942
943                 u = bkey_disassemble(b, k, &tmp);
944
945                 invalid = btree_bkey_invalid(c, b, u);
946                 if (invalid) {
947                         char buf[160];
948
949                         bch_bkey_val_to_text(c, btree_node_type(b),
950                                              buf, sizeof(buf), u);
951                         btree_node_error(b, c, ptr,
952                                          "invalid bkey %s: %s", buf, invalid);
953
954                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
955                         memmove_u64s_down(k, bkey_next(k),
956                                           (u64 *) vstruct_end(i) - (u64 *) k);
957                         continue;
958                 }
959
960                 /*
961                  * with the separate whiteouts thing (used for extents), the
962                  * second set of keys actually can have whiteouts too, so we
963                  * can't solely go off bkey_whiteout()...
964                  */
965
966                 if (!seen_non_whiteout &&
967                     (!bkey_whiteout(k) ||
968                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
969                         *whiteout_u64s = k->_data - i->_data;
970                         seen_non_whiteout = true;
971                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
972                         btree_node_error(b, c, ptr,
973                                          "keys out of order: %llu:%llu > %llu:%llu",
974                                          prev_pos.inode,
975                                          prev_pos.offset,
976                                          u.k->p.inode,
977                                          bkey_start_offset(u.k));
978                         /* XXX: repair this */
979                 }
980
981                 prev_pos = u.k->p;
982                 prev = k;
983                 k = bkey_next(k);
984         }
985
986         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
987         return NULL;
988 }
989
990 static bool extent_contains_ptr(struct bkey_s_c_extent e,
991                                 struct bch_extent_ptr match)
992 {
993         const struct bch_extent_ptr *ptr;
994
995         extent_for_each_ptr(e, ptr)
996                 if (!memcmp(ptr, &match, sizeof(*ptr)))
997                         return true;
998
999         return false;
1000 }
1001
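/*
 * Finish reading a btree node: walk its on-disk bsets in order, verifying
 * magic, checksums and the backpointer, decrypting and validating each bset,
 * then merge-sort everything (fixing up overlapping extents) into a single
 * in-memory bset and rebuild the aux search tree. Any failure marks the node
 * with a read error.
 */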
1002 void bch_btree_node_read_done(struct cache_set *c, struct btree *b,
1003                               struct cache *ca,
1004                               const struct bch_extent_ptr *ptr)
1005 {
1006         struct btree_node_entry *bne;
1007         struct bset *i = &b->data->keys;
1008         struct btree_node_iter *iter;
1009         struct btree_node *sorted;
1010         bool used_mempool;
1011         unsigned u64s;
1012         const char *err;
1013         struct bch_csum csum;
1014         struct nonce nonce;
1015         int ret;
1016
1017         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1018         __bch_btree_node_iter_init(iter, btree_node_is_extents(b));
1019
1020         err = "dynamic fault";
1021         if (bch_meta_read_fault("btree"))
1022                 goto err;
1023
1024         while (b->written < c->sb.btree_node_size) {
1025                 unsigned sectors, whiteout_u64s = 0;
1026
1027                 if (!b->written) {
1028                         i = &b->data->keys;
1029
1030                         err = "bad magic";
1031                         if (le64_to_cpu(b->data->magic) != bset_magic(c))
1032                                 goto err;
1033
1034                         err = "bad btree header";
1035                         if (!b->data->keys.seq)
1036                                 goto err;
1037
1038                         err = "unknown checksum type";
1039                         if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1040                                 goto err;
1041
1042                         /* XXX: retry checksum errors */
1043
1044                         nonce = btree_nonce(b, i, b->written << 9);
1045                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1046
1047                         err = "bad checksum";
1048                         if (bch_crc_cmp(csum, b->data->csum))
1049                                 goto err;
1050
1051                         bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1052                                     &b->data->flags,
1053                                     (void *) &b->data->keys -
1054                                     (void *) &b->data->flags);
1055                         nonce = nonce_add(nonce,
1056                                           round_up((void *) &b->data->keys -
1057                                                    (void *) &b->data->flags,
1058                                                    CHACHA20_BLOCK_SIZE));
1059                         bset_encrypt(c, i, nonce);
1060
1061                         sectors = vstruct_sectors(b->data, c->block_bits);
1062
1063                         if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
1064                                 u64 *p = (u64 *) &b->data->ptr;
1065
1066                                 *p = swab64(*p);
1067                                 bch_bpos_swab(&b->data->min_key);
1068                                 bch_bpos_swab(&b->data->max_key);
1069                         }
1070
1071                         err = "incorrect btree id";
1072                         if (BTREE_NODE_ID(b->data) != b->btree_id)
1073                                 goto err;
1074
1075                         err = "incorrect level";
1076                         if (BTREE_NODE_LEVEL(b->data) != b->level)
1077                                 goto err;
1078
1079                         err = "incorrect max key";
1080                         if (bkey_cmp(b->data->max_key, b->key.k.p))
1081                                 goto err;
1082
1083                         err = "incorrect backpointer";
1084                         if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
1085                                                  b->data->ptr))
1086                                 goto err;
1087
1088                         err = bch_bkey_format_validate(&b->data->format);
1089                         if (err)
1090                                 goto err;
1091
1092                         set_btree_bset(b, b->set, &b->data->keys);
1093
1094                         btree_node_set_format(b, b->data->format);
1095                 } else {
1096                         bne = write_block(b);
1097                         i = &bne->keys;
1098
1099                         if (i->seq != b->data->keys.seq)
1100                                 break;
1101
1102                         err = "unknown checksum type";
1103                         if (!bch_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1104                                 goto err;
1105
1106                         nonce = btree_nonce(b, i, b->written << 9);
1107                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1108
1109                         err = "bad checksum";
1110                         if (memcmp(&csum, &bne->csum, sizeof(csum)))
1111                                 goto err;
1112
1113                         bset_encrypt(c, i, nonce);
1114
1115                         sectors = vstruct_sectors(bne, c->block_bits);
1116                 }
1117
1118                 err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
1119                 if (err)
1120                         goto err;
1121
1122                 b->written += sectors;
1123
1124                 err = "insufficient memory";
1125                 ret = bch_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1126                 if (ret < 0)
1127                         goto err;
1128
1129                 if (ret)
1130                         continue;
1131
1132                 __bch_btree_node_iter_push(iter, b,
1133                                            i->start,
1134                                            vstruct_idx(i, whiteout_u64s));
1135
1136                 __bch_btree_node_iter_push(iter, b,
1137                                            vstruct_idx(i, whiteout_u64s),
1138                                            vstruct_last(i));
1139         }
1140
1141         err = "corrupted btree";
1142         for (bne = write_block(b);
1143              bset_byte_offset(b, bne) < btree_bytes(c);
1144              bne = (void *) bne + block_bytes(c))
1145                 if (bne->keys.seq == b->data->keys.seq)
1146                         goto err;
1147
1148         sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
1149         sorted->keys.u64s = 0;
1150
1151         b->nr = btree_node_is_extents(b)
1152                 ? bch_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1153                 : bch_key_sort_fix_overlapping(&sorted->keys, b, iter);
1154
1155         u64s = le16_to_cpu(sorted->keys.u64s);
1156         *sorted = *b->data;
1157         sorted->keys.u64s = cpu_to_le16(u64s);
1158         swap(sorted, b->data);
1159         set_btree_bset(b, b->set, &b->data->keys);
1160         b->nsets = 1;
1161
1162         BUG_ON(b->nr.live_u64s != u64s);
1163
1164         btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
1165
1166         bch_bset_build_aux_tree(b, b->set, false);
1167
1168         set_needs_whiteout(btree_bset_first(b));
1169
1170         btree_node_reset_sib_u64s(b);
1171 out:
1172         mempool_free(iter, &c->fill_iter);
1173         return;
1174 err:
1175         set_btree_node_read_error(b);
1176         btree_node_error(b, c, ptr, "%s", err);
1177         goto out;
1178 }
1179
1180 static void btree_node_read_endio(struct bio *bio)
1181 {
1182         closure_put(bio->bi_private);
1183 }
1184
1185 void bch_btree_node_read(struct cache_set *c, struct btree *b)
1186 {
1187         uint64_t start_time = local_clock();
1188         struct closure cl;
1189         struct bio *bio;
1190         struct extent_pick_ptr pick;
1191
1192         trace_bcache_btree_read(c, b);
1193
1194         closure_init_stack(&cl);
1195
1196         pick = bch_btree_pick_ptr(c, b);
1197         if (cache_set_fatal_err_on(!pick.ca, c,
1198                                    "no cache device for btree node")) {
1199                 set_btree_node_read_error(b);
1200                 return;
1201         }
1202
1203         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
1204         bio->bi_bdev            = pick.ca->disk_sb.bdev;
1205         bio->bi_iter.bi_sector  = pick.ptr.offset;
1206         bio->bi_iter.bi_size    = btree_bytes(c);
1207         bio->bi_end_io          = btree_node_read_endio;
1208         bio->bi_private         = &cl;
1209         bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
1210
1211         bch_bio_map(bio, b->data);
1212
1213         closure_get(&cl);
1214         bch_generic_make_request(bio, c);
1215         closure_sync(&cl);
1216
1217         if (cache_fatal_io_err_on(bio->bi_error,
1218                                   pick.ca, "IO error reading bucket %zu",
1219                                   PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
1220             bch_meta_read_fault("btree")) {
1221                 set_btree_node_read_error(b);
1222                 goto out;
1223         }
1224
1225         bch_btree_node_read_done(c, b, pick.ca, &pick.ptr);
1226         bch_time_stats_update(&c->btree_read_time, start_time);
1227 out:
1228         bio_put(bio);
1229         percpu_ref_put(&pick.ca->ref);
1230 }
1231
1232 int bch_btree_root_read(struct cache_set *c, enum btree_id id,
1233                         const struct bkey_i *k, unsigned level)
1234 {
1235         struct closure cl;
1236         struct btree *b;
1237         int ret;
1238
1239         closure_init_stack(&cl);
1240
1241         do {
1242                 ret = mca_cannibalize_lock(c, &cl);
1243                 closure_sync(&cl);
1244         } while (ret);
1245
1246         b = mca_alloc(c);
1247         mca_cannibalize_unlock(c);
1248
1249         BUG_ON(IS_ERR(b));
1250
1251         bkey_copy(&b->key, k);
1252         BUG_ON(mca_hash_insert(c, b, level, id));
1253
1254         bch_btree_node_read(c, b);
1255         six_unlock_write(&b->lock);
1256
1257         if (btree_node_read_error(b)) {
1258                 six_unlock_intent(&b->lock);
1259                 return -EIO;
1260         }
1261
1262         bch_btree_set_root_initial(c, b, NULL);
1263         six_unlock_intent(&b->lock);
1264
1265         return 0;
1266 }
1267
1268 void bch_btree_complete_write(struct cache_set *c, struct btree *b,
1269                               struct btree_write *w)
1270 {
1271         bch_journal_pin_drop(&c->journal, &w->journal);
1272         closure_wake_up(&w->wait);
1273 }
1274
1275 static void btree_node_write_done(struct cache_set *c, struct btree *b)
1276 {
1277         struct btree_write *w = btree_prev_write(b);
1278
1279         /*
1280          * Before calling bch_btree_complete_write() - if the write errored, we
1281          * have to halt new journal writes before they see this btree node
1282          * write as completed:
1283          */
1284         if (btree_node_write_error(b))
1285                 bch_journal_halt(&c->journal);
1286
1287         bch_btree_complete_write(c, b, w);
1288         btree_node_io_unlock(b);
1289 }
1290
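/*
 * Write completion: record any IO error against the node, free the bounce
 * buffer if one was used, then either complete the original bio (for split
 * bios) or finish the node write and drop the parent closure.
 */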
1291 static void btree_node_write_endio(struct bio *bio)
1292 {
1293         struct btree *b = bio->bi_private;
1294         struct bch_write_bio *wbio = to_wbio(bio);
1295         struct cache_set *c     = wbio->c;
1296         struct bio *orig        = wbio->split ? wbio->orig : NULL;
1297         struct closure *cl      = !wbio->split ? wbio->cl : NULL;
1298         struct cache *ca        = wbio->ca;
1299
1300         if (cache_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
1301             bch_meta_write_fault("btree"))
1302                 set_btree_node_write_error(b);
1303
1304         if (wbio->bounce)
1305                 btree_bounce_free(c,
1306                         wbio->order,
1307                         wbio->used_mempool,
1308                         page_address(bio->bi_io_vec[0].bv_page));
1309
1310         if (wbio->put_bio)
1311                 bio_put(bio);
1312
1313         if (orig) {
1314                 bio_endio(orig);
1315         } else {
1316                 btree_node_write_done(c, b);
1317                 if (cl)
1318                         closure_put(cl);
1319         }
1320
1321         if (ca)
1322                 percpu_ref_put(&ca->ref);
1323 }
1324
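/*
 * Write out a btree node's dirty bsets: claim the write by atomically
 * updating the node's flag word, compact whiteouts, merge-sort all unwritten
 * bsets (together with the pending whiteouts) into one new bset in a bounce
 * buffer, checksum/encrypt it, and submit it to all replicas at the node's
 * current write offset.
 */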
1325 void __bch_btree_node_write(struct cache_set *c, struct btree *b,
1326                             struct closure *parent,
1327                             enum six_lock_type lock_type_held,
1328                             int idx_to_write)
1329 {
1330         struct bio *bio;
1331         struct bch_write_bio *wbio;
1332         struct bset_tree *t;
1333         struct bset *i;
1334         struct btree_node *bn = NULL;
1335         struct btree_node_entry *bne = NULL;
1336         BKEY_PADDED(key) k;
1337         struct bkey_s_extent e;
1338         struct bch_extent_ptr *ptr;
1339         struct cache *ca;
1340         struct sort_iter sort_iter;
1341         struct nonce nonce;
1342         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1343         u64 seq = 0;
1344         bool used_mempool;
1345         unsigned long old, new;
1346         void *data;
1347
1348         /*
1349          * We may only have a read lock on the btree node - the dirty bit is our
1350          * "lock" against racing with other threads that may be trying to start
1351          * a write, we do a write iff we clear the dirty bit. Since setting the
1352          * dirty bit requires a write lock, we can't race with other threads
1353          * redirtying it:
1354          */
1355         do {
1356                 old = new = READ_ONCE(b->flags);
1357
1358                 if (!(old & (1 << BTREE_NODE_dirty)))
1359                         return;
1360
1361                 if (idx_to_write >= 0 &&
1362                     idx_to_write != !!(old & (1 << BTREE_NODE_write_idx)))
1363                         return;
1364
1365                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1366                         wait_on_bit_io(&b->flags,
1367                                        BTREE_NODE_write_in_flight,
1368                                        TASK_UNINTERRUPTIBLE);
1369                         continue;
1370                 }
1371
1372                 new &= ~(1 << BTREE_NODE_dirty);
1373                 new |=  (1 << BTREE_NODE_write_in_flight);
1374                 new |=  (1 << BTREE_NODE_just_written);
1375                 new ^=  (1 << BTREE_NODE_write_idx);
1376         } while (cmpxchg_acquire(&b->flags, old, new) != old);
1377
1378         BUG_ON(!list_empty(&b->write_blocked));
1379
1380         BUG_ON(b->written >= c->sb.btree_node_size);
1381         BUG_ON(bset_written(b, btree_bset_last(b)));
1382         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1383         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1384
1385         if (lock_type_held == SIX_LOCK_intent) {
1386                 six_lock_write(&b->lock);
1387                 __bch_compact_whiteouts(c, b, COMPACT_WRITTEN);
1388                 six_unlock_write(&b->lock);
1389         } else {
1390                 __bch_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1391         }
1392
1393         BUG_ON(b->uncompacted_whiteout_u64s);
1394
1395         sort_iter_init(&sort_iter, b);
1396
1397         bytes = !b->written
1398                 ? sizeof(struct btree_node)
1399                 : sizeof(struct btree_node_entry);
1400
1401         bytes += b->whiteout_u64s * sizeof(u64);
1402
1403         for_each_bset(b, t) {
1404                 i = bset(b, t);
1405
1406                 if (bset_written(b, i))
1407                         continue;
1408
1409                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1410                 sort_iter_add(&sort_iter,
1411                               btree_bkey_first(b, t),
1412                               btree_bkey_last(b, t));
1413                 seq = max(seq, le64_to_cpu(i->journal_seq));
1414         }
1415
1416         order = get_order(bytes);
1417         data = btree_bounce_alloc(c, order, &used_mempool);
1418
1419         if (!b->written) {
1420                 bn = data;
1421                 *bn = *b->data;
1422                 i = &bn->keys;
1423         } else {
1424                 bne = data;
1425                 bne->keys = b->data->keys;
1426                 i = &bne->keys;
1427         }
1428
1429         i->journal_seq  = cpu_to_le64(seq);
1430         i->u64s         = 0;
1431
1432         if (!btree_node_is_extents(b)) {
1433                 sort_iter_add(&sort_iter,
1434                               unwritten_whiteouts_start(c, b),
1435                               unwritten_whiteouts_end(c, b));
1436                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1437         } else {
1438                 memcpy_u64s(i->start,
1439                             unwritten_whiteouts_start(c, b),
1440                             b->whiteout_u64s);
1441                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1442                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1443         }
1444
1445         b->whiteout_u64s = 0;
1446
1447         u64s = btree_node_is_extents(b)
1448                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1449                 : sort_keys(i->start, &sort_iter, false);
1450         le16_add_cpu(&i->u64s, u64s);
1451
1452         clear_needs_whiteout(i);
1453
1454         if (b->written && !i->u64s) {
1455                 /* Nothing to write: */
1456                 btree_bounce_free(c, order, used_mempool, data);
1457                 btree_node_write_done(c, b);
1458                 return;
1459         }
1460
1461         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1462         BUG_ON(i->seq != b->data->keys.seq);
1463
1464         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1465         SET_BSET_CSUM_TYPE(i, bch_meta_checksum_type(c));
1466
1467         nonce = btree_nonce(b, i, b->written << 9);
1468
1469         if (bn) {
1470                 bch_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1471                             &bn->flags,
1472                             (void *) &b->data->keys -
1473                             (void *) &b->data->flags);
1474                 nonce = nonce_add(nonce,
1475                                   round_up((void *) &b->data->keys -
1476                                            (void *) &b->data->flags,
1477                                            CHACHA20_BLOCK_SIZE));
1478                 bset_encrypt(c, i, nonce);
1479
1480                 nonce = btree_nonce(b, i, b->written << 9);
1481                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1482         } else {
1483                 bset_encrypt(c, i, nonce);
1484
1485                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1486         }
1487
1488         bytes_to_write = vstruct_end(i) - data;
1489         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1490
1491         memset(data + bytes_to_write, 0,
1492                (sectors_to_write << 9) - bytes_to_write);
1493
1494         BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
1495
1496         trace_bcache_btree_write(b, bytes_to_write, sectors_to_write);
1497
1498         /*
1499          * We handle btree write errors by immediately halting the journal -
1500          * after we've done that, we can't issue any subsequent btree writes
1501          * because they might have pointers to new nodes that failed to write.
1502          *
1503          * Furthermore, there's no point in doing any more btree writes:
1504          * with the journal stopped, we're never going to be able to update
1505          * it to reflect that those writes completed and that their data was
1506          * flushed from the journal.
1507          *
1508          * Make sure to update b->written so bch_btree_init_next() doesn't
1509          * break:
1510          */
1511         if (bch_journal_error(&c->journal) ||
1512             c->opts.nochanges) {
1513                 set_btree_node_noevict(b);
1514                 b->written += sectors_to_write;
1515
1516                 btree_bounce_free(c, order, used_mempool, data);
1517                 btree_node_write_done(c, b);
1518                 return;
1519         }
1520
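        /*
         * Set up the write bio: it records that it owns a bounce buffer (and
         * how to free it) and is sized to the padded sector count computed
         * above:
         */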
1521         bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
1522
1523         wbio                    = to_wbio(bio);
1524         wbio->cl                = parent;
1525         wbio->bounce            = true;
1526         wbio->put_bio           = true;
1527         wbio->order             = order;
1528         wbio->used_mempool      = used_mempool;
1529         bio->bi_iter.bi_size    = sectors_to_write << 9;
1530         bio->bi_end_io          = btree_node_write_endio;
1531         bio->bi_private         = b;
1532         bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
1533
1534         if (parent)
1535                 closure_get(parent);
1536
1537         bch_bio_map(bio, data);
1538
1539         /*
1540          * If we're appending to a leaf node, we don't technically need FUA -
1541          * this write just needs to be persisted before the next journal write,
1542          * which will be marked FLUSH|FUA.
1543          *
1544          * Similarly if we're writing a new btree root - the pointer is going to
1545          * be in the next journal entry.
1546          *
1547          * But if we're writing a new btree node (that isn't a root) or
1548          * appending to a non-leaf btree node, we need either FUA or a flush
1549          * when we write the parent with the new pointer. FUA is cheaper than a
1550          * flush, and writes appending to leaf nodes aren't blocking anything,
1551          * so just make all btree node writes FUA to keep things sane.
1552          */
1553
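        /*
         * The node's key points at the start of the btree node on disk; bump
         * each pointer by the sectors already written so this write lands
         * immediately after them:
         */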
1554         bkey_copy(&k.key, &b->key);
1555         e = bkey_i_to_s_extent(&k.key);
1556
1557         extent_for_each_ptr(e, ptr)
1558                 ptr->offset += b->written;
1559
1560         rcu_read_lock();
1561         extent_for_each_online_device(c, e, ptr, ca)
1562                 atomic64_add(sectors_to_write, &ca->btree_sectors_written);
1563         rcu_read_unlock();
1564
1565         b->written += sectors_to_write;
1566
1567         bch_submit_wbio_replicas(wbio, c, &k.key, true);
1568 }
1569
1570 /*
1571  * Work that must be done with write lock held:
1572  */
1573 bool bch_btree_post_write_cleanup(struct cache_set *c, struct btree *b)
1574 {
1575         bool invalidated_iter = false;
1576         struct btree_node_entry *bne;
1577         struct bset_tree *t;
1578
1579         if (!btree_node_just_written(b))
1580                 return false;
1581
1582         BUG_ON(b->whiteout_u64s);
1583         BUG_ON(b->uncompacted_whiteout_u64s);
1584
1585         clear_btree_node_just_written(b);
1586
1587         /*
1588          * Note: immediately after write, bset_unwritten()/bset_written() don't
1589          * work - the amount of data we had to write after compaction might have
1590          * been smaller than the offset of the last bset.
1591          *
1592          * However, we know that all bsets have been written here, as long as
1593          * we're still holding the write lock:
1594          */
1595
1596         /*
1597          * XXX: decide if we really want to unconditionally sort down to a
1598          * single bset:
1599          */
1600         if (b->nsets > 1) {
1601                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1602                 invalidated_iter = true;
1603         } else {
1604                 invalidated_iter = bch_drop_whiteouts(b);
1605         }
1606
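        /*
         * Everything in this node is now on disk, so from here on deleting
         * any of these keys will require writing a whiteout:
         */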
1607         for_each_bset(b, t)
1608                 set_needs_whiteout(bset(b, t));
1609
1610         bch_btree_verify(c, b);
1611
1612         /*
1613          * If later we don't unconditionally sort down to a single bset, we have
1614          * to ensure this is still true:
1615          */
1616         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1617
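        /*
         * If there's still room in the node, start a new (empty) bset for
         * subsequent inserts to go into:
         */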
1618         bne = want_new_bset(c, b);
1619         if (bne)
1620                 bch_bset_init_next(b, &bne->keys);
1621
1622         bch_btree_build_aux_trees(b);
1623
1624         return invalidated_iter;
1625 }
1626
1627 /*
1628  * Use this one if the node is read or intent locked (never write locked):
1629  */
1630 void bch_btree_node_write(struct cache_set *c, struct btree *b,
1631                           struct closure *parent,
1632                           enum six_lock_type lock_type_held,
1633                           int idx_to_write)
1634 {
1635         BUG_ON(lock_type_held == SIX_LOCK_write);
1636
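        /*
         * If we hold (or can upgrade to) an intent lock, briefly take the
         * write lock afterwards to do the post write cleanup; otherwise issue
         * the write with only the read lock held:
         */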
1637         if (lock_type_held == SIX_LOCK_intent ||
1638             six_trylock_convert(&b->lock, SIX_LOCK_read,
1639                                 SIX_LOCK_intent)) {
1640                 __bch_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write);
1641
1642                 six_lock_write(&b->lock);
1643                 bch_btree_post_write_cleanup(c, b);
1644                 six_unlock_write(&b->lock);
1645
1646                 if (lock_type_held == SIX_LOCK_read)
1647                         six_lock_downgrade(&b->lock);
1648         } else {
1649                 __bch_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write);
1650         }
1651 }
1652
1653 static void bch_btree_node_write_dirty(struct cache_set *c, struct btree *b,
1654                                        struct closure *parent)
1655 {
1656         six_lock_read(&b->lock);
1657         BUG_ON(b->level);
1658
1659         bch_btree_node_write(c, b, parent, SIX_LOCK_read, -1);
1660         six_unlock_read(&b->lock);
1661 }
1662
1663 /*
1664  * Write all dirty leaf btree nodes to disk (only level 0 nodes are written here)
1665  */
1666 void bch_btree_flush(struct cache_set *c)
1667 {
1668         struct closure cl;
1669         struct btree *b;
1670         struct bucket_table *tbl;
1671         struct rhash_head *pos;
1672         bool dropped_lock;
1673         unsigned i;
1674
1675         closure_init_stack(&cl);
1676
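        /*
         * Walk the btree node hash table under RCU; each time we drop the RCU
         * lock to issue a write, re-dereference the table and rescan from the
         * current bucket, looping until a full pass issues no writes:
         */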
1677         rcu_read_lock();
1678
1679         do {
1680                 dropped_lock = false;
1681                 i = 0;
1682 restart:
1683                 tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
1684                                           &c->btree_cache_table);
1685
1686                 for (; i < tbl->size; i++)
1687                         rht_for_each_entry_rcu(b, pos, tbl, i, hash)
1688                                 /*
1689                                  * XXX - locking for b->level, when called from
1690                                  * bch_journal_move()
1691                                  */
1692                                 if (!b->level && btree_node_dirty(b)) {
1693                                         rcu_read_unlock();
1694                                         bch_btree_node_write_dirty(c, b, &cl);
1695                                         dropped_lock = true;
1696                                         rcu_read_lock();
1697                                         goto restart;
1698                                 }
1699         } while (dropped_lock);
1700
1701         rcu_read_unlock();
1702
1703         closure_sync(&cl);
1704 }
1705
1706 /**
1707  * bch_btree_node_flush_journal_entries - flush any journal entries that
1708  * contain keys from this node
1709  *
1710  * Each bset's journal sequence number is used to preserve the ordering of
1711  * index updates across unclean shutdowns: on recovery, bsets newer than the
1712  * most recent journal entry are ignored.
1713  *
1714  * But when rewriting a btree node we compact all of its bsets into one - and
1715  * if that merges a bset that should be ignored with bsets we do need, recovery
1716  * can no longer tell them apart. To avoid that, before making the new node
1717  * visible, flush the journal so that every bset we compacted is covered by a
1718  * journal entry and none of it will be ignored.
1719  */
1720 void bch_btree_node_flush_journal_entries(struct cache_set *c,
1721                                           struct btree *b,
1722                                           struct closure *cl)
1723 {
1724         int i = b->nsets;
1725
1726         /*
1727          * Journal sequence numbers in the different bsets will always be in
1728          * ascending order, so we only need to flush the highest - except that
1729          * the most recent bset might not have a journal sequence number yet,
1730          * so we need to loop:
1731          */
1732         while (i--) {
1733                 u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
1734
1735                 if (seq) {
1736                         bch_journal_flush_seq_async(&c->journal, seq, cl);
1737                         break;
1738                 }
1739         }
1740 }