[bcachefs-tools-debian] / libbcachefs / btree_io.c
1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_iter.h"
7 #include "btree_locking.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "buckets.h"
11 #include "checksum.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "io.h"
16 #include "journal.h"
17 #include "super-io.h"
18
19 #include <trace/events/bcachefs.h>
20
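/*
 * Debug-only check that keys in [start, end) are in order: in extent nodes a
 * key may not extend past the start of the next key; in other nodes positions
 * must be strictly increasing.
 */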
21 static void verify_no_dups(struct btree *b,
22                            struct bkey_packed *start,
23                            struct bkey_packed *end)
24 {
25 #ifdef CONFIG_BCACHEFS_DEBUG
26         struct bkey_packed *k;
27
28         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
29                 struct bkey l = bkey_unpack_key(b, k);
30                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
31
32                 BUG_ON(btree_node_is_extents(b)
33                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
34                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
35                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
36         }
37 #endif
38 }
39
40 static void clear_needs_whiteout(struct bset *i)
41 {
42         struct bkey_packed *k;
43
44         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
45                 k->needs_whiteout = false;
46 }
47
48 static void set_needs_whiteout(struct bset *i)
49 {
50         struct bkey_packed *k;
51
52         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
53                 k->needs_whiteout = true;
54 }
55
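/*
 * Scratch buffers for sorting/compacting: try an opportunistic page
 * allocation first, and fall back to the btree_bounce_pool mempool if that
 * fails.
 */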
56 static void btree_bounce_free(struct bch_fs *c, unsigned order,
57                               bool used_mempool, void *p)
58 {
59         if (used_mempool)
60                 mempool_free(p, &c->btree_bounce_pool);
61         else
62                 vpfree(p, PAGE_SIZE << order);
63 }
64
65 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
66                                 bool *used_mempool)
67 {
68         void *p;
69
70         BUG_ON(order > btree_page_order(c));
71
72         *used_mempool = false;
73         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
74         if (p)
75                 return p;
76
77         *used_mempool = true;
78         return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
79 }
80
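/*
 * Minimal merge iterator over up to MAX_BSETS + 1 already-sorted ranges of
 * packed keys: the ranges are kept ordered by their current key, and
 * advancing re-sifts the front entry with a simple insertion step.
 */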
81 typedef int (*sort_cmp_fn)(struct btree *,
82                            struct bkey_packed *,
83                            struct bkey_packed *);
84
85 struct sort_iter {
86         struct btree    *b;
87         unsigned                used;
88
89         struct sort_iter_set {
90                 struct bkey_packed *k, *end;
91         } data[MAX_BSETS + 1];
92 };
93
94 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
95 {
96         memset(iter, 0, sizeof(*iter));
97         iter->b = b;
98 }
99
100 static inline void __sort_iter_sift(struct sort_iter *iter,
101                                     unsigned from,
102                                     sort_cmp_fn cmp)
103 {
104         unsigned i;
105
106         for (i = from;
107              i + 1 < iter->used &&
108              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
109              i++)
110                 swap(iter->data[i], iter->data[i + 1]);
111 }
112
113 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
114 {
115
116         __sort_iter_sift(iter, 0, cmp);
117 }
118
119 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
120 {
121         unsigned i = iter->used;
122
123         while (i--)
124                 __sort_iter_sift(iter, i, cmp);
125 }
126
127 static void sort_iter_add(struct sort_iter *iter,
128                           struct bkey_packed *k,
129                           struct bkey_packed *end)
130 {
131         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
132
133         if (k != end)
134                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
135 }
136
137 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
138 {
139         return iter->used ? iter->data->k : NULL;
140 }
141
142 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
143 {
144         iter->data->k = bkey_next(iter->data->k);
145
146         BUG_ON(iter->data->k > iter->data->end);
147
148         if (iter->data->k == iter->data->end)
149                 array_remove_item(iter->data, iter->used, 0);
150         else
151                 sort_iter_sift(iter, cmp);
152 }
153
154 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
155                                                  sort_cmp_fn cmp)
156 {
157         struct bkey_packed *ret = sort_iter_peek(iter);
158
159         if (ret)
160                 sort_iter_advance(iter, cmp);
161
162         return ret;
163 }
164
165 static inline int sort_key_whiteouts_cmp(struct btree *b,
166                                          struct bkey_packed *l,
167                                          struct bkey_packed *r)
168 {
169         return bkey_cmp_packed(b, l, r);
170 }
171
172 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
173                                    struct sort_iter *iter)
174 {
175         struct bkey_packed *in, *out = dst;
176
177         sort_iter_sort(iter, sort_key_whiteouts_cmp);
178
179         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
180                 bkey_copy(out, in);
181                 out = bkey_next(out);
182         }
183
184         return (u64 *) out - (u64 *) dst;
185 }
186
187 static inline int sort_extent_whiteouts_cmp(struct btree *b,
188                                             struct bkey_packed *l,
189                                             struct bkey_packed *r)
190 {
191         struct bkey ul = bkey_unpack_key(b, l);
192         struct bkey ur = bkey_unpack_key(b, r);
193
194         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
195 }
196
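/*
 * Sort extent whiteouts by start position and coalesce them: a whiteout that
 * overlaps the previous one is either dropped (if fully covered) or has its
 * front cut off, with the previous whiteout extended as far as its packed
 * format allows.
 */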
197 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
198                                       struct sort_iter *iter)
199 {
200         const struct bkey_format *f = &iter->b->format;
201         struct bkey_packed *in, *out = dst;
202         struct bkey_i l, r;
203         bool prev = false, l_packed = false;
204         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
205         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
206         u64 new_size;
207
208         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
209
210         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
211
212         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
213                 EBUG_ON(bkeyp_val_u64s(f, in));
214                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
215
216                 r.k = bkey_unpack_key(iter->b, in);
217
218                 if (prev &&
219                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
220                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
221                                 continue;
222
223                         new_size = l_packed
224                                 ? min(max_packed_size, max_packed_offset -
225                                       bkey_start_offset(&l.k))
226                                 : KEY_SIZE_MAX;
227
228                         new_size = min(new_size, r.k.p.offset -
229                                        bkey_start_offset(&l.k));
230
231                         BUG_ON(new_size < l.k.size);
232
233                         bch2_key_resize(&l.k, new_size);
234
235                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
236                                 continue;
237
238                         bch2_cut_front(l.k.p, &r);
239                 }
240
241                 if (prev) {
242                         if (!bch2_bkey_pack(out, &l, f)) {
243                                 BUG_ON(l_packed);
244                                 bkey_copy(out, &l);
245                         }
246                         out = bkey_next(out);
247                 }
248
249                 l = r;
250                 prev = true;
251                 l_packed = bkey_packed(in);
252         }
253
254         if (prev) {
255                 if (!bch2_bkey_pack(out, &l, f)) {
256                         BUG_ON(l_packed);
257                         bkey_copy(out, &l);
258                 }
259                 out = bkey_next(out);
260         }
261
262         return (u64 *) out - (u64 *) dst;
263 }
264
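/*
 * Number of u64s a compaction would reclaim from this bset (u64s belonging to
 * keys that are no longer live). COMPACT_LAZY only bothers if more than a
 * quarter of the bset is dead keys, or if we're already compacting and the
 * bset hasn't been written out yet; other modes only touch written bsets.
 */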
265 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
266                                     bool compacting,
267                                     enum compact_mode mode)
268 {
269         unsigned live_u64s = b->nr.bset_u64s[t - b->set];
270         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
271
272         if (live_u64s == bset_u64s)
273                 return 0;
274
275         if (mode == COMPACT_LAZY) {
276                 if (live_u64s * 4 < bset_u64s * 3 ||
277                     (compacting && bset_unwritten(b, bset(b, t))))
278                         return bset_u64s - live_u64s;
279         } else {
280                 if (bset_written(b, bset(b, t)))
281                         return bset_u64s - live_u64s;
282         }
283
284         return 0;
285 }
286
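/*
 * Compact the bsets that are worth compacting: dead keys are dropped, and
 * whiteouts that still need to be written are moved out of the bsets into the
 * unwritten whiteouts area at the end of the node buffer (sorted, and merged
 * for extents). Unless called in COMPACT_WRITTEN_NO_WRITE_LOCK mode, the
 * bsets themselves are rewritten to contain only live keys. Returns true if
 * any compaction was done.
 */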
287 bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
288                              enum compact_mode mode)
289 {
290         const struct bkey_format *f = &b->format;
291         struct bset_tree *t;
292         struct bkey_packed *whiteouts = NULL;
293         struct bkey_packed *u_start, *u_pos;
294         struct sort_iter sort_iter;
295         unsigned order, whiteout_u64s = 0, u64s;
296         bool used_mempool, compacting = false;
297
298         for_each_bset(b, t)
299                 whiteout_u64s += should_compact_bset(b, t,
300                                         whiteout_u64s != 0, mode);
301
302         if (!whiteout_u64s)
303                 return false;
304
305         sort_iter_init(&sort_iter, b);
306
307         whiteout_u64s += b->whiteout_u64s;
308         order = get_order(whiteout_u64s * sizeof(u64));
309
310         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
311         u_start = u_pos = whiteouts;
312
313         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
314                     b->whiteout_u64s);
315         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
316
317         sort_iter_add(&sort_iter, u_start, u_pos);
318
319         for_each_bset(b, t) {
320                 struct bset *i = bset(b, t);
321                 struct bkey_packed *k, *n, *out, *start, *end;
322                 struct btree_node_entry *src = NULL, *dst = NULL;
323
324                 if (t != b->set && bset_unwritten(b, i)) {
325                         src = container_of(i, struct btree_node_entry, keys);
326                         dst = max(write_block(b),
327                                   (void *) btree_bkey_last(b, t - 1));
328                 }
329
330                 if (!should_compact_bset(b, t, compacting, mode)) {
331                         if (src != dst) {
332                                 memmove(dst, src, sizeof(*src) +
333                                         le16_to_cpu(src->keys.u64s) *
334                                         sizeof(u64));
335                                 i = &dst->keys;
336                                 set_btree_bset(b, t, i);
337                         }
338                         continue;
339                 }
340
341                 compacting = true;
342                 u_start = u_pos;
343                 start = i->start;
344                 end = vstruct_last(i);
345
346                 if (src != dst) {
347                         memmove(dst, src, sizeof(*src));
348                         i = &dst->keys;
349                         set_btree_bset(b, t, i);
350                 }
351
352                 out = i->start;
353
354                 for (k = start; k != end; k = n) {
355                         n = bkey_next(k);
356
357                         if (bkey_deleted(k) && btree_node_is_extents(b))
358                                 continue;
359
360                         if (bkey_whiteout(k) && !k->needs_whiteout)
361                                 continue;
362
363                         if (bkey_whiteout(k)) {
364                                 unreserve_whiteout(b, t, k);
365                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
366                                 set_bkeyp_val_u64s(f, u_pos, 0);
367                                 u_pos = bkey_next(u_pos);
368                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
369                                 bkey_copy(out, k);
370                                 out = bkey_next(out);
371                         }
372                 }
373
374                 sort_iter_add(&sort_iter, u_start, u_pos);
375
376                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
377                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
378                         set_btree_bset_end(b, t);
379                         bch2_bset_set_no_aux_tree(b, t);
380                 }
381         }
382
383         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
384
385         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
386                (void *) btree_bkey_last(b, bset_tree_last(b)));
387
388         u64s = btree_node_is_extents(b)
389                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
390                                         &sort_iter)
391                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
392                                      &sort_iter);
393
394         BUG_ON(u64s > b->whiteout_u64s);
395         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
396         BUG_ON(u_pos != whiteouts && !u64s);
397
398         if (u64s != b->whiteout_u64s) {
399                 void *src = unwritten_whiteouts_start(c, b);
400
401                 b->whiteout_u64s = u64s;
402                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
403         }
404
405         verify_no_dups(b,
406                        unwritten_whiteouts_start(c, b),
407                        unwritten_whiteouts_end(c, b));
408
409         btree_bounce_free(c, order, used_mempool, whiteouts);
410
411         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
412                 bch2_btree_build_aux_trees(b);
413
414         bch_btree_keys_u64s_remaining(c, b);
415         bch2_verify_btree_nr_keys(b);
416
417         return true;
418 }
419
420 static bool bch2_drop_whiteouts(struct btree *b)
421 {
422         struct bset_tree *t;
423         bool ret = false;
424
425         for_each_bset(b, t) {
426                 struct bset *i = bset(b, t);
427                 struct bkey_packed *k, *n, *out, *start, *end;
428
429                 if (!should_compact_bset(b, t, true, true))
430                         continue;
431
432                 start   = btree_bkey_first(b, t);
433                 end     = btree_bkey_last(b, t);
434
435                 if (bset_unwritten(b, i) &&
436                     t != b->set) {
437                         struct bset *dst =
438                                max_t(struct bset *, write_block(b),
439                                      (void *) btree_bkey_last(b, t - 1));
440
441                         memmove(dst, i, sizeof(struct bset));
442                         i = dst;
443                         set_btree_bset(b, t, i);
444                 }
445
446                 out = i->start;
447
448                 for (k = start; k != end; k = n) {
449                         n = bkey_next(k);
450
451                         if (!bkey_whiteout(k)) {
452                                 bkey_copy(out, k);
453                                 out = bkey_next(out);
454                         }
455                 }
456
457                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
458                 bch2_bset_set_no_aux_tree(b, t);
459                 ret = true;
460         }
461
462         bch2_verify_btree_nr_keys(b);
463
464         return ret;
465 }
466
467 static inline int sort_keys_cmp(struct btree *b,
468                                 struct bkey_packed *l,
469                                 struct bkey_packed *r)
470 {
471         return bkey_cmp_packed(b, l, r) ?:
472                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
473                 (int) l->needs_whiteout - (int) r->needs_whiteout;
474 }
475
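/*
 * Merge sorted bsets of non-extent keys: a whiteout followed by an equal key
 * is dropped (propagating its needs_whiteout flag), and whiteouts are dropped
 * entirely when filter_whiteouts is set or they don't need to be written.
 */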
476 static unsigned sort_keys(struct bkey_packed *dst,
477                           struct sort_iter *iter,
478                           bool filter_whiteouts)
479 {
480         const struct bkey_format *f = &iter->b->format;
481         struct bkey_packed *in, *next, *out = dst;
482
483         sort_iter_sort(iter, sort_keys_cmp);
484
485         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
486                 if (bkey_whiteout(in) &&
487                     (filter_whiteouts || !in->needs_whiteout))
488                         continue;
489
490                 if (bkey_whiteout(in) &&
491                     (next = sort_iter_peek(iter)) &&
492                     !bkey_cmp_packed(iter->b, in, next)) {
493                         BUG_ON(in->needs_whiteout &&
494                                next->needs_whiteout);
495                         /*
496                          * XXX racy, called with read lock from write path
497                          *
498                          * leads to spurious BUG_ON() in bkey_unpack_key() in
499                          * debug mode
500                          */
501                         next->needs_whiteout |= in->needs_whiteout;
502                         continue;
503                 }
504
505                 if (bkey_whiteout(in)) {
506                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
507                         set_bkeyp_val_u64s(f, out, 0);
508                 } else {
509                         bkey_copy(out, in);
510                 }
511                 out = bkey_next(out);
512         }
513
514         return (u64 *) out - (u64 *) dst;
515 }
516
517 static inline int sort_extents_cmp(struct btree *b,
518                                    struct bkey_packed *l,
519                                    struct bkey_packed *r)
520 {
521         return bkey_cmp_packed(b, l, r) ?:
522                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
523 }
524
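/* As above, but for extent nodes: deleted keys are dropped entirely. */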
525 static unsigned sort_extents(struct bkey_packed *dst,
526                              struct sort_iter *iter,
527                              bool filter_whiteouts)
528 {
529         struct bkey_packed *in, *out = dst;
530
531         sort_iter_sort(iter, sort_extents_cmp);
532
533         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
534                 if (bkey_deleted(in))
535                         continue;
536
537                 if (bkey_whiteout(in) &&
538                     (filter_whiteouts || !in->needs_whiteout))
539                         continue;
540
541                 bkey_copy(out, in);
542                 out = bkey_next(out);
543         }
544
545         return (u64 *) out - (u64 *) dst;
546 }
547
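/*
 * Sort bsets [start_idx, end_idx) together into a bounce buffer; if the whole
 * node was sorted we just swap the buffer with the node's, otherwise the
 * result is copied back into the first bset being sorted and the remaining
 * bset_trees are shifted down.
 */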
548 static void btree_node_sort(struct bch_fs *c, struct btree *b,
549                             struct btree_iter *iter,
550                             unsigned start_idx,
551                             unsigned end_idx,
552                             bool filter_whiteouts)
553 {
554         struct btree_node *out;
555         struct sort_iter sort_iter;
556         struct bset_tree *t;
557         struct bset *start_bset = bset(b, &b->set[start_idx]);
558         bool used_mempool = false;
559         u64 start_time;
560         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
561         bool sorting_entire_node = start_idx == 0 &&
562                 end_idx == b->nsets;
563
564         sort_iter_init(&sort_iter, b);
565
566         for (t = b->set + start_idx;
567              t < b->set + end_idx;
568              t++) {
569                 u64s += le16_to_cpu(bset(b, t)->u64s);
570                 sort_iter_add(&sort_iter,
571                               btree_bkey_first(b, t),
572                               btree_bkey_last(b, t));
573         }
574
575         order = sorting_entire_node
576                 ? btree_page_order(c)
577                 : get_order(__vstruct_bytes(struct btree_node, u64s));
578
579         out = btree_bounce_alloc(c, order, &used_mempool);
580
581         start_time = local_clock();
582
583         if (btree_node_is_extents(b))
584                 filter_whiteouts = bset_written(b, start_bset);
585
586         u64s = btree_node_is_extents(b)
587                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
588                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
589
590         out->keys.u64s = cpu_to_le16(u64s);
591
592         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
593
594         if (sorting_entire_node)
595                 bch2_time_stats_update(&c->btree_sort_time, start_time);
596
597         /* Make sure we preserve bset journal_seq: */
598         for (t = b->set + start_idx + 1;
599              t < b->set + end_idx;
600              t++)
601                 start_bset->journal_seq =
602                         max(start_bset->journal_seq,
603                             bset(b, t)->journal_seq);
604
605         if (sorting_entire_node) {
606                 unsigned u64s = le16_to_cpu(out->keys.u64s);
607
608                 BUG_ON(order != btree_page_order(c));
609
610                 /*
611                  * Our temporary buffer is the same size as the btree node's
612                  * buffer, we can just swap buffers instead of doing a big
613                  * memcpy()
614                  */
615                 *out = *b->data;
616                 out->keys.u64s = cpu_to_le16(u64s);
617                 swap(out, b->data);
618                 set_btree_bset(b, b->set, &b->data->keys);
619         } else {
620                 start_bset->u64s = out->keys.u64s;
621                 memcpy_u64s(start_bset->start,
622                             out->keys.start,
623                             le16_to_cpu(out->keys.u64s));
624         }
625
626         for (i = start_idx + 1; i < end_idx; i++)
627                 b->nr.bset_u64s[start_idx] +=
628                         b->nr.bset_u64s[i];
629
630         b->nsets -= shift;
631
632         for (i = start_idx + 1; i < b->nsets; i++) {
633                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
634                 b->set[i]               = b->set[i + shift];
635         }
636
637         for (i = b->nsets; i < MAX_BSETS; i++)
638                 b->nr.bset_u64s[i] = 0;
639
640         set_btree_bset_end(b, &b->set[start_idx]);
641         bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
642
643         btree_bounce_free(c, order, used_mempool, out);
644
645         bch2_verify_btree_nr_keys(b);
646 }
647
648 /* Sort + repack in a new format: */
649 static struct btree_nr_keys sort_repack(struct bset *dst,
650                                         struct btree *src,
651                                         struct btree_node_iter *src_iter,
652                                         struct bkey_format *out_f,
653                                         bool filter_whiteouts)
654 {
655         struct bkey_format *in_f = &src->format;
656         struct bkey_packed *in, *out = vstruct_last(dst);
657         struct btree_nr_keys nr;
658
659         memset(&nr, 0, sizeof(nr));
660
661         while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
662                 if (filter_whiteouts && bkey_whiteout(in))
663                         continue;
664
665                 if (bch2_bkey_transform(out_f, out, bkey_packed(in)
666                                        ? in_f : &bch2_bkey_format_current, in))
667                         out->format = KEY_FORMAT_LOCAL_BTREE;
668                 else
669                         bch2_bkey_unpack(src, (void *) out, in);
670
671                 btree_keys_account_key_add(&nr, 0, out);
672                 out = bkey_next(out);
673         }
674
675         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
676         return nr;
677 }
678
679 /* Sort, repack, and merge: */
680 static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
681                                               struct bset *dst,
682                                               struct btree *src,
683                                               struct btree_node_iter *iter,
684                                               struct bkey_format *out_f,
685                                               bool filter_whiteouts,
686                                               key_filter_fn filter,
687                                               key_merge_fn merge)
688 {
689         struct bkey_packed *k, *prev = NULL, *out;
690         struct btree_nr_keys nr;
691         BKEY_PADDED(k) tmp;
692
693         memset(&nr, 0, sizeof(nr));
694
695         while ((k = bch2_btree_node_iter_next_all(iter, src))) {
696                 if (filter_whiteouts && bkey_whiteout(k))
697                         continue;
698
699                 /*
700                  * The filter might modify pointers, so we have to unpack the
701                  * key and values to &tmp.k:
702                  */
703                 bch2_bkey_unpack(src, &tmp.k, k);
704
705                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
706                         continue;
707
708                 /* prev is always unpacked, for key merging: */
709
710                 if (prev &&
711                     merge &&
712                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
713                         continue;
714
715                 /*
716                  * the current key becomes the new prev: advance prev, then
717                  * copy the current key - but first pack prev (in place):
718                  */
719                 if (prev) {
720                         bch2_bkey_pack(prev, (void *) prev, out_f);
721
722                         btree_keys_account_key_add(&nr, 0, prev);
723                         prev = bkey_next(prev);
724                 } else {
725                         prev = vstruct_last(dst);
726                 }
727
728                 bkey_copy(prev, &tmp.k);
729         }
730
731         if (prev) {
732                 bch2_bkey_pack(prev, (void *) prev, out_f);
733                 btree_keys_account_key_add(&nr, 0, prev);
734                 out = bkey_next(prev);
735         } else {
736                 out = vstruct_last(dst);
737         }
738
739         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
740         return nr;
741 }
742
743 void bch2_btree_sort_into(struct bch_fs *c,
744                          struct btree *dst,
745                          struct btree *src)
746 {
747         struct btree_nr_keys nr;
748         struct btree_node_iter src_iter;
749         u64 start_time = local_clock();
750
751         BUG_ON(dst->nsets != 1);
752
753         bch2_bset_set_no_aux_tree(dst, dst->set);
754
755         bch2_btree_node_iter_init_from_start(&src_iter, src,
756                                             btree_node_is_extents(src));
757
758         if (btree_node_ops(src)->key_normalize ||
759             btree_node_ops(src)->key_merge)
760                 nr = sort_repack_merge(c, btree_bset_first(dst),
761                                 src, &src_iter,
762                                 &dst->format,
763                                 true,
764                                 btree_node_ops(src)->key_normalize,
765                                 btree_node_ops(src)->key_merge);
766         else
767                 nr = sort_repack(btree_bset_first(dst),
768                                 src, &src_iter,
769                                 &dst->format,
770                                 true);
771
772         bch2_time_stats_update(&c->btree_sort_time, start_time);
773
774         set_btree_bset_end(dst, dst->set);
775
776         dst->nr.live_u64s       += nr.live_u64s;
777         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
778         dst->nr.packed_keys     += nr.packed_keys;
779         dst->nr.unpacked_keys   += nr.unpacked_keys;
780
781         bch2_verify_btree_nr_keys(dst);
782 }
783
784 #define SORT_CRIT       (4096 / sizeof(u64))
785
786 /*
787  * We're about to add another bset to the btree node, so if there's currently
788  * too many bsets - sort some of them together:
789  */
790 static bool btree_node_compact(struct bch_fs *c, struct btree *b,
791                                struct btree_iter *iter)
792 {
793         unsigned unwritten_idx;
794         bool ret = false;
795
796         for (unwritten_idx = 0;
797              unwritten_idx < b->nsets;
798              unwritten_idx++)
799                 if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
800                         break;
801
802         if (b->nsets - unwritten_idx > 1) {
803                 btree_node_sort(c, b, iter, unwritten_idx,
804                                 b->nsets, false);
805                 ret = true;
806         }
807
808         if (unwritten_idx > 1) {
809                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
810                 ret = true;
811         }
812
813         return ret;
814 }
815
816 void bch2_btree_build_aux_trees(struct btree *b)
817 {
818         struct bset_tree *t;
819
820         for_each_bset(b, t)
821                 bch2_bset_build_aux_tree(b, t,
822                                 bset_unwritten(b, bset(b, t)) &&
823                                 t == bset_tree_last(b));
824 }
825
826 /*
827  * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be
828  * inserted into
829  *
830  * Safe to call if there already is an unwritten bset - will only add a new bset
831  * if @b doesn't already have one.
832  *
833  * If we sorted (i.e. invalidated iterators), @iter is re-initialized afterwards.
834  */
835 void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
836                          struct btree_iter *iter)
837 {
838         struct btree_node_entry *bne;
839         bool did_sort;
840
841         EBUG_ON(!(b->lock.state.seq & 1));
842         EBUG_ON(iter && iter->nodes[b->level] != b);
843
844         did_sort = btree_node_compact(c, b, iter);
845
846         bne = want_new_bset(c, b);
847         if (bne)
848                 bch2_bset_init_next(b, &bne->keys);
849
850         bch2_btree_build_aux_trees(b);
851
852         if (iter && did_sort)
853                 bch2_btree_iter_reinit_node(iter, b);
854 }
855
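/*
 * Nonce for encrypting a bset: derived from the bset's seq and journal_seq
 * plus its sector offset within the node, so each bset gets a distinct nonce.
 * The same routine is used to decrypt on the read path, since this is a
 * stream cipher.
 */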
856 static struct nonce btree_nonce(struct bset *i, unsigned offset)
857 {
858         return (struct nonce) {{
859                 [0] = cpu_to_le32(offset),
860                 [1] = ((__le32 *) &i->seq)[0],
861                 [2] = ((__le32 *) &i->seq)[1],
862                 [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
863         }};
864 }
865
866 static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
867 {
868         struct nonce nonce = btree_nonce(i, offset);
869
870         if (!offset) {
871                 struct btree_node *bn = container_of(i, struct btree_node, keys);
872                 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
873
874                 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
875                              bytes);
876
877                 nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
878         }
879
880         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
881                      vstruct_end(i) - (void *) i->_data);
882 }
883
884 static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
885                          unsigned offset, int write, char *buf, size_t len)
886 {
887         char *out = buf, *end = buf + len;
888
889         out += scnprintf(out, end - out,
890                          "error validating btree node %s "
891                          "at btree %u level %u/%u\n"
892                          "pos %llu:%llu node offset %u",
893                          write ? "before write " : "",
894                          b->btree_id, b->level,
895                          c->btree_roots[b->btree_id].level,
896                          b->key.k.p.inode, b->key.k.p.offset,
897                          b->written);
898         if (i)
899                 out += scnprintf(out, end - out,
900                                  " bset u64s %u",
901                                  le16_to_cpu(i->u64s));
902
903         return out - buf;
904 }
905
906 enum btree_err_type {
907         BTREE_ERR_FIXABLE,
908         BTREE_ERR_WANT_RETRY,
909         BTREE_ERR_MUST_RETRY,
910         BTREE_ERR_FATAL,
911 };
912
913 enum btree_validate_ret {
914         BTREE_RETRY_READ = 64,
915 };
916
917 #define btree_err(type, c, b, i, msg, ...)                              \
918 ({                                                                      \
919         char buf[200], *out = buf, *end = out + sizeof(buf);            \
920                                                                         \
921         out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
922         out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);      \
923                                                                         \
924         if (type == BTREE_ERR_FIXABLE &&                                \
925             write == READ &&                                            \
926             !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
927                 mustfix_fsck_err(c, "%s", buf);                         \
928         } else {                                                        \
929                 bch_err(c, "%s", buf);                                  \
930                                                                         \
931                 switch (type) {                                         \
932                 case BTREE_ERR_FIXABLE:                                 \
933                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
934                         goto fsck_err;                                  \
935                 case BTREE_ERR_WANT_RETRY:                              \
936                         if (have_retry) {                               \
937                                 ret = BTREE_RETRY_READ;                 \
938                                 goto fsck_err;                          \
939                         }                                               \
940                         break;                                          \
941                 case BTREE_ERR_MUST_RETRY:                              \
942                         ret = BTREE_RETRY_READ;                         \
943                         goto fsck_err;                                  \
944                 case BTREE_ERR_FATAL:                                   \
945                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
946                         goto fsck_err;                                  \
947                 }                                                       \
948         }                                                               \
949         true;                                                           \
950 })
951
952 #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
953
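/*
 * Validate a single bset we just read: for the first bset also check the node
 * header (btree id, level, max key, key format), then walk the keys checking
 * sizes, formats and ordering, fixing up or dropping what we can and flagging
 * the rest via btree_err(). For separate-whiteouts bsets, the number of u64s
 * of leading whiteouts is reported via *whiteout_u64s.
 */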
954 static int validate_bset(struct bch_fs *c, struct btree *b,
955                          struct bset *i, unsigned sectors,
956                          unsigned *whiteout_u64s, int write,
957                          bool have_retry)
958 {
959         struct bkey_packed *k, *prev = NULL;
960         struct bpos prev_pos = POS_MIN;
961         bool seen_non_whiteout = false;
962         const char *err;
963         int ret = 0;
964
965         if (i == &b->data->keys) {
966                 /* These indicate that we read the wrong btree node: */
967                 btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
968                              BTREE_ERR_MUST_RETRY, c, b, i,
969                              "incorrect btree id");
970
971                 btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
972                              BTREE_ERR_MUST_RETRY, c, b, i,
973                              "incorrect level");
974
975                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
976                         u64 *p = (u64 *) &b->data->ptr;
977
978                         *p = swab64(*p);
979                         bch2_bpos_swab(&b->data->min_key);
980                         bch2_bpos_swab(&b->data->max_key);
981                 }
982
983                 btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
984                              BTREE_ERR_MUST_RETRY, c, b, i,
985                              "incorrect max key");
986
987                 /* XXX: ideally we would be validating min_key too */
988 #if 0
989                 /*
990                  * not correct anymore, due to btree node write error
991                  * handling
992                  *
993                  * need to add b->data->seq to btree keys and verify
994                  * against that
995                  */
996                 btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
997                                                   b->data->ptr),
998                              BTREE_ERR_FATAL, c, b, i,
999                              "incorrect backpointer");
1000 #endif
1001                 err = bch2_bkey_format_validate(&b->data->format);
1002                 btree_err_on(err,
1003                              BTREE_ERR_FATAL, c, b, i,
1004                              "invalid bkey format: %s", err);
1005         }
1006
1007         if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
1008                          BTREE_ERR_FIXABLE, c, b, i,
1009                          "unsupported bset version")) {
1010                 i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1011                 i->u64s = 0;
1012                 return 0;
1013         }
1014
1015         if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
1016                          BTREE_ERR_FIXABLE, c, b, i,
1017                          "bset past end of btree node")) {
1018                 i->u64s = 0;
1019                 return 0;
1020         }
1021
1022         btree_err_on(b->written && !i->u64s,
1023                      BTREE_ERR_FIXABLE, c, b, i,
1024                      "empty bset");
1025
1026         if (!BSET_SEPARATE_WHITEOUTS(i)) {
1027                 seen_non_whiteout = true;
1028                 *whiteout_u64s = 0;
1029         }
1030
1031         for (k = i->start;
1032              k != vstruct_last(i);) {
1033                 struct bkey_s_c u;
1034                 struct bkey tmp;
1035                 const char *invalid;
1036
1037                 if (btree_err_on(!k->u64s,
1038                                  BTREE_ERR_FIXABLE, c, b, i,
1039                                  "KEY_U64s 0: %zu bytes of metadata lost",
1040                                  vstruct_end(i) - (void *) k)) {
1041                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1042                         break;
1043                 }
1044
1045                 if (btree_err_on(bkey_next(k) > vstruct_last(i),
1046                                  BTREE_ERR_FIXABLE, c, b, i,
1047                                  "key extends past end of bset")) {
1048                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1049                         break;
1050                 }
1051
1052                 if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
1053                                  BTREE_ERR_FIXABLE, c, b, i,
1054                                  "invalid bkey format %u", k->format)) {
1055                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1056                         memmove_u64s_down(k, bkey_next(k),
1057                                           (u64 *) vstruct_end(i) - (u64 *) k);
1058                         continue;
1059                 }
1060
1061                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
1062                         bch2_bkey_swab(btree_node_type(b), &b->format, k);
1063
1064                 u = bkey_disassemble(b, k, &tmp);
1065
1066                 invalid = bch2_btree_bkey_invalid(c, b, u);
1067                 if (invalid) {
1068                         char buf[160];
1069
1070                         bch2_bkey_val_to_text(c, btree_node_type(b),
1071                                               buf, sizeof(buf), u);
1072                         btree_err(BTREE_ERR_FIXABLE, c, b, i,
1073                                   "invalid bkey %s: %s", buf, invalid);
1074
1075                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1076                         memmove_u64s_down(k, bkey_next(k),
1077                                           (u64 *) vstruct_end(i) - (u64 *) k);
1078                         continue;
1079                 }
1080
1081                 /*
1082                  * with the separate whiteouts thing (used for extents), the
1083                  * second set of keys actually can have whiteouts too, so we
1084                  * can't solely go off bkey_whiteout()...
1085                  */
1086
1087                 if (!seen_non_whiteout &&
1088                     (!bkey_whiteout(k) ||
1089                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
1090                         *whiteout_u64s = k->_data - i->_data;
1091                         seen_non_whiteout = true;
1092                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
1093                         btree_err(BTREE_ERR_FATAL, c, b, i,
1094                                   "keys out of order: %llu:%llu > %llu:%llu",
1095                                   prev_pos.inode,
1096                                   prev_pos.offset,
1097                                   u.k->p.inode,
1098                                   bkey_start_offset(u.k));
1099                         /* XXX: repair this */
1100                 }
1101
1102                 prev_pos = u.k->p;
1103                 prev = k;
1104                 k = bkey_next(k);
1105         }
1106
1107         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
1108 fsck_err:
1109         return ret;
1110 }
1111
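/*
 * Read path: walk the bsets in the node buffer, verifying checksums,
 * decrypting and validating each one and pushing its keys into a node
 * iterator; then sort everything (fixing up overlapping extents) into a
 * single bset and rebuild the aux search tree.
 */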
1112 int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
1113 {
1114         struct btree_node_entry *bne;
1115         struct btree_node_iter *iter;
1116         struct btree_node *sorted;
1117         bool used_mempool;
1118         unsigned u64s;
1119         int ret, retry_read = 0, write = READ;
1120
1121         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1122         __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
1123
1124         if (bch2_meta_read_fault("btree"))
1125                 btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
1126                           "dynamic fault");
1127
1128         btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
1129                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1130                      "bad magic");
1131
1132         btree_err_on(!b->data->keys.seq,
1133                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1134                      "bad btree header");
1135
1136         while (b->written < c->opts.btree_node_size) {
1137                 unsigned sectors, whiteout_u64s = 0;
1138                 struct nonce nonce;
1139                 struct bch_csum csum;
1140                 struct bset *i;
1141
1142                 if (!b->written) {
1143                         i = &b->data->keys;
1144
1145                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1146                                      BTREE_ERR_WANT_RETRY, c, b, i,
1147                                      "unknown checksum type");
1148
1149                         nonce = btree_nonce(i, b->written << 9);
1150                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1151
1152                         btree_err_on(bch2_crc_cmp(csum, b->data->csum),
1153                                      BTREE_ERR_WANT_RETRY, c, b, i,
1154                                      "invalid checksum");
1155
1156                         bset_encrypt(c, i, b->written << 9);
1157
1158                         sectors = vstruct_sectors(b->data, c->block_bits);
1159
1160                         set_btree_bset(b, b->set, &b->data->keys);
1161                         btree_node_set_format(b, b->data->format);
1162                 } else {
1163                         bne = write_block(b);
1164                         i = &bne->keys;
1165
1166                         if (i->seq != b->data->keys.seq)
1167                                 break;
1168
1169                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1170                                      BTREE_ERR_WANT_RETRY, c, b, i,
1171                                      "unknown checksum type");
1172
1173                         nonce = btree_nonce(i, b->written << 9);
1174                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1175
1176                         btree_err_on(bch2_crc_cmp(csum, bne->csum),
1177                                      BTREE_ERR_WANT_RETRY, c, b, i,
1178                                      "invalid checksum");
1179
1180                         bset_encrypt(c, i, b->written << 9);
1181
1182                         sectors = vstruct_sectors(bne, c->block_bits);
1183                 }
1184
1185                 ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
1186                                     READ, have_retry);
1187                 if (ret)
1188                         goto fsck_err;
1189
1190                 b->written += sectors;
1191
1192                 ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1193                 if (ret < 0) {
1194                         btree_err(BTREE_ERR_FATAL, c, b, i,
1195                                   "insufficient memory");
1196                         goto err;
1197                 }
1198
1199                 if (ret) {
1200                         btree_err_on(!b->written,
1201                                      BTREE_ERR_FIXABLE, c, b, i,
1202                                      "first btree node bset has blacklisted journal seq");
1203                         if (b->written)
1204                                 continue;
1205                 }
1206
1207                 __bch2_btree_node_iter_push(iter, b,
1208                                            i->start,
1209                                            vstruct_idx(i, whiteout_u64s));
1210
1211                 __bch2_btree_node_iter_push(iter, b,
1212                                            vstruct_idx(i, whiteout_u64s),
1213                                            vstruct_last(i));
1214         }
1215
1216         for (bne = write_block(b);
1217              bset_byte_offset(b, bne) < btree_bytes(c);
1218              bne = (void *) bne + block_bytes(c))
1219                 btree_err_on(bne->keys.seq == b->data->keys.seq,
1220                              BTREE_ERR_WANT_RETRY, c, b, NULL,
1221                              "found bset signature after last bset");
1222
1223         sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
1224         sorted->keys.u64s = 0;
1225
1226         b->nr = btree_node_is_extents(b)
1227                 ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1228                 : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
1229
1230         u64s = le16_to_cpu(sorted->keys.u64s);
1231         *sorted = *b->data;
1232         sorted->keys.u64s = cpu_to_le16(u64s);
1233         swap(sorted, b->data);
1234         set_btree_bset(b, b->set, &b->data->keys);
1235         b->nsets = 1;
1236
1237         BUG_ON(b->nr.live_u64s != u64s);
1238
1239         btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
1240
1241         bch2_bset_build_aux_tree(b, b->set, false);
1242
1243         set_needs_whiteout(btree_bset_first(b));
1244
1245         btree_node_reset_sib_u64s(b);
1246 out:
1247         mempool_free(iter, &c->fill_iter);
1248         return retry_read;
1249 err:
1250 fsck_err:
1251         if (ret == BTREE_RETRY_READ) {
1252                 retry_read = 1;
1253         } else {
1254                 bch2_inconsistent_error(c);
1255                 set_btree_node_read_error(b);
1256         }
1257         goto out;
1258 }
1259
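/*
 * Called on read completion (and for retries): if the read or validation
 * failed, retry from another replica, masking out devices we've already
 * tried, until we run out of pointers.
 */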
1260 static void btree_node_read_work(struct work_struct *work)
1261 {
1262         struct btree_read_bio *rb =
1263                 container_of(work, struct btree_read_bio, work);
1264         struct bch_fs *c        = rb->c;
1265         struct btree *b         = rb->bio.bi_private;
1266         struct bio *bio         = &rb->bio;
1267         struct bch_devs_mask avoid;
1268
1269         memset(&avoid, 0, sizeof(avoid));
1270
1271         goto start;
1272         do {
1273                 bch_info(c, "retrying read");
1274                 bio_reset(bio);
1275                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1276                 bio->bi_bdev            = rb->pick.ca->disk_sb.bdev;
1277                 bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
1278                 bio->bi_iter.bi_size    = btree_bytes(c);
1279                 submit_bio_wait(bio);
1280 start:
1281                 bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
1282                 percpu_ref_put(&rb->pick.ca->io_ref);
1283
1284                 __set_bit(rb->pick.ca->dev_idx, avoid.d);
1285                 rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
1286
1287                 if (!bio->bi_error &&
1288                     !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
1289                         goto out;
1290         } while (!IS_ERR_OR_NULL(rb->pick.ca));
1291
1292         set_btree_node_read_error(b);
1293 out:
1294         if (!IS_ERR_OR_NULL(rb->pick.ca))
1295                 percpu_ref_put(&rb->pick.ca->io_ref);
1296
1297         bch2_time_stats_update(&c->btree_read_time, rb->start_time);
1298         bio_put(&rb->bio);
1299         clear_btree_node_read_in_flight(b);
1300         wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1301 }
1302
1303 static void btree_node_read_endio(struct bio *bio)
1304 {
1305         struct btree_read_bio *rb =
1306                 container_of(bio, struct btree_read_bio, bio);
1307
1308         bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
1309
1310         INIT_WORK(&rb->work, btree_node_read_work);
1311         schedule_work(&rb->work);
1312 }
1313
1314 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
1315                           bool sync)
1316 {
1317         struct extent_pick_ptr pick;
1318         struct btree_read_bio *rb;
1319         struct bio *bio;
1320
1321         trace_btree_read(c, b);
1322
1323         pick = bch2_btree_pick_ptr(c, b, NULL);
1324         if (bch2_fs_fatal_err_on(!pick.ca, c,
1325                         "btree node read error: no device to read from")) {
1326                 set_btree_node_read_error(b);
1327                 return;
1328         }
1329
1330         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
1331         rb = container_of(bio, struct btree_read_bio, bio);
1332         rb->c                   = c;
1333         rb->start_time          = local_clock();
1334         rb->pick                = pick;
1335         bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1336         bio->bi_bdev            = pick.ca->disk_sb.bdev;
1337         bio->bi_iter.bi_sector  = pick.ptr.offset;
1338         bio->bi_iter.bi_size    = btree_bytes(c);
1339         bch2_bio_map(bio, b->data);
1340
1341         this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
1342                      bio_sectors(bio));
1343
1344         set_btree_node_read_in_flight(b);
1345
1346         if (sync) {
1347                 submit_bio_wait(bio);
1348                 bio->bi_private = b;
1349                 btree_node_read_work(&rb->work);
1350         } else {
1351                 bio->bi_end_io  = btree_node_read_endio;
1352                 bio->bi_private = b;
1353                 submit_bio(bio);
1354         }
1355 }
1356
1357 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
1358                         const struct bkey_i *k, unsigned level)
1359 {
1360         struct closure cl;
1361         struct btree *b;
1362         int ret;
1363
1364         closure_init_stack(&cl);
1365
1366         do {
1367                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1368                 closure_sync(&cl);
1369         } while (ret);
1370
1371         b = bch2_btree_node_mem_alloc(c);
1372         bch2_btree_cache_cannibalize_unlock(c);
1373
1374         BUG_ON(IS_ERR(b));
1375
1376         bkey_copy(&b->key, k);
1377         BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
1378
1379         bch2_btree_node_read(c, b, true);
1380         six_unlock_write(&b->lock);
1381
1382         if (btree_node_read_error(b)) {
1383                 six_unlock_intent(&b->lock);
1384                 return -EIO;
1385         }
1386
1387         bch2_btree_set_root_for_read(c, b);
1388         six_unlock_intent(&b->lock);
1389
1390         return 0;
1391 }
1392
1393 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1394                               struct btree_write *w)
1395 {
1396         bch2_journal_pin_drop(&c->journal, &w->journal);
1397         closure_wake_up(&w->wait);
1398 }
1399
1400 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1401 {
1402         struct btree_write *w = btree_prev_write(b);
1403
1404         bch2_btree_complete_write(c, b, w);
1405         btree_node_io_unlock(b);
1406 }
1407
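/*
 * A replica write failed: unless the node has since been freed, drop the
 * pointers to the failed replicas from the node's key and update it; if that
 * leaves no pointers, or the update itself fails, the node is marked noevict
 * and bch2_fatal_error() is called.
 */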
1408 static void bch2_btree_node_write_error(struct bch_fs *c,
1409                                         struct bch_write_bio *wbio)
1410 {
1411         struct btree *b         = wbio->bio.bi_private;
1412         struct closure *cl      = wbio->cl;
1413         __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
1414         struct bkey_i_extent *new_key;
1415
1416         six_lock_read(&b->lock);
1417         bkey_copy(&tmp.k, &b->key);
1418         six_unlock_read(&b->lock);
1419
1420         if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
1421                 /* Node has been freed: */
1422                 goto out;
1423         }
1424
1425         new_key = bkey_i_to_extent(&tmp.k);
1426
1427         while (wbio->replicas_failed) {
1428                 unsigned idx = __fls(wbio->replicas_failed);
1429
1430                 bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
1431                 wbio->replicas_failed ^= 1 << idx;
1432         }
1433
1434         if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
1435             bch2_btree_node_update_key(c, b, new_key)) {
1436                 set_btree_node_noevict(b);
1437                 bch2_fatal_error(c);
1438         }
1439 out:
1440         bio_put(&wbio->bio);
1441         btree_node_write_done(c, b);
1442         if (cl)
1443                 closure_put(cl);
1444 }
1445
1446 void bch2_btree_write_error_work(struct work_struct *work)
1447 {
1448         struct bch_fs *c = container_of(work, struct bch_fs,
1449                                         btree_write_error_work);
1450         struct bio *bio;
1451
1452         while (1) {
1453                 spin_lock_irq(&c->btree_write_error_lock);
1454                 bio = bio_list_pop(&c->btree_write_error_list);
1455                 spin_unlock_irq(&c->btree_write_error_lock);
1456
1457                 if (!bio)
1458                         break;
1459
1460                 bch2_btree_node_write_error(c, to_wbio(bio));
1461         }
1462 }
1463
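/*
 * Completion for one device's part of a btree node write: account latency,
 * record any failure in the original bio's replicas_failed mask, and when the
 * original (non-split) bio completes, free the bounce buffer and either punt
 * to the write error worker or finish the write.
 */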
1464 static void btree_node_write_endio(struct bio *bio)
1465 {
1466         struct btree *b                 = bio->bi_private;
1467         struct bch_write_bio *wbio      = to_wbio(bio);
1468         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
1469         struct bch_write_bio *orig      = parent ?: wbio;
1470         struct closure *cl              = !wbio->split ? wbio->cl : NULL;
1471         struct bch_fs *c                = wbio->c;
1472         struct bch_dev *ca              = wbio->ca;
1473
1474         bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
1475
1476         if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
1477             bch2_meta_write_fault("btree"))
1478                 set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
1479
1480         if (wbio->have_io_ref)
1481                 percpu_ref_put(&ca->io_ref);
1482
1483         if (parent) {
1484                 bio_put(bio);
1485                 bio_endio(&parent->bio);
1486                 return;
1487         }
1488
1489         btree_bounce_free(c,
1490                 wbio->order,
1491                 wbio->used_mempool,
1492                 wbio->data);
1493
1494         if (wbio->replicas_failed) {
1495                 unsigned long flags;
1496
1497                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1498                 bio_list_add(&c->btree_write_error_list, &wbio->bio);
1499                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1500                 queue_work(c->wq, &c->btree_write_error_work);
1501                 return;
1502         }
1503
1504         bio_put(bio);
1505         btree_node_write_done(c, b);
1506         if (cl)
1507                 closure_put(cl);
1508 }
1509
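/*
 * Sanity check a bset we're about to write; if it fails validation something
 * is badly wrong, so flag the filesystem as inconsistent.
 */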
1510 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
1511                                    struct bset *i, unsigned sectors)
1512 {
1513         const struct bch_extent_ptr *ptr;
1514         unsigned whiteout_u64s = 0;
1515         int ret;
1516
1517         extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
1518                 break;
1519
1520         ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
1521         if (ret)
1522                 bch2_inconsistent_error(c);
1523
1524         return ret;
1525 }
1526
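/*
 * Write out the dirty part of a btree node: atomically claim the write via
 * the node's flags, compact whiteouts, sort the unwritten bsets into a bounce
 * buffer, checksum (and possibly encrypt) it, and submit a write to each
 * replica. May be called with only a read lock held; see @lock_type_held.
 */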
1527 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1528                             struct closure *parent,
1529                             enum six_lock_type lock_type_held)
1530 {
1531         struct bch_write_bio *wbio;
1532         struct bset_tree *t;
1533         struct bset *i;
1534         struct btree_node *bn = NULL;
1535         struct btree_node_entry *bne = NULL;
1536         BKEY_PADDED(key) k;
1537         struct bkey_s_extent e;
1538         struct bch_extent_ptr *ptr;
1539         struct sort_iter sort_iter;
1540         struct nonce nonce;
1541         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1542         u64 seq = 0;
1543         bool used_mempool;
1544         unsigned long old, new;
1545         void *data;
1546
1547         /*
1548          * We may only have a read lock on the btree node - the dirty bit is our
1549          * "lock" against racing with other threads that may be trying to start
1550          * a write; we only do a write if we successfully clear the dirty bit.
1551          * Since setting the dirty bit requires a write lock, we can't race with
1552          * other threads redirtying it:
1553          */
1554         do {
1555                 old = new = READ_ONCE(b->flags);
1556
1557                 if (!(old & (1 << BTREE_NODE_dirty)))
1558                         return;
1559
1560                 if (b->written &&
1561                     !btree_node_may_write(b))
1562                         return;
1563
1564                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1565                         btree_node_wait_on_io(b);
1566                         continue;
1567                 }
1568
1569                 new &= ~(1 << BTREE_NODE_dirty);
1570                 new &= ~(1 << BTREE_NODE_need_write);
1571                 new |=  (1 << BTREE_NODE_write_in_flight);
1572                 new |=  (1 << BTREE_NODE_just_written);
1573                 new ^=  (1 << BTREE_NODE_write_idx);
1574         } while (cmpxchg_acquire(&b->flags, old, new) != old);
1575
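        /*
         * We now own the write: dirty and need_write are cleared,
         * write_in_flight and just_written are set, and write_idx has flipped
         * so subsequent updates go to the other btree_write:
         */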
1576         BUG_ON(!list_empty(&b->write_blocked));
1577         BUG_ON((b->will_make_reachable != NULL) != !b->written);
1578
1579         BUG_ON(b->written >= c->opts.btree_node_size);
1580         BUG_ON(bset_written(b, btree_bset_last(b)));
1581         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1582         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1583
1584         if (lock_type_held == SIX_LOCK_intent) {
1585                 six_lock_write(&b->lock);
1586                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
1587                 six_unlock_write(&b->lock);
1588         } else {
1589                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1590         }
1591
1592         BUG_ON(b->uncompacted_whiteout_u64s);
1593
1594         sort_iter_init(&sort_iter, b);
1595
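        /*
         * Size the bounce buffer: node or bset entry header, plus pending
         * whiteouts, plus every bset that hasn't been written yet:
         */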
1596         bytes = !b->written
1597                 ? sizeof(struct btree_node)
1598                 : sizeof(struct btree_node_entry);
1599
1600         bytes += b->whiteout_u64s * sizeof(u64);
1601
1602         for_each_bset(b, t) {
1603                 i = bset(b, t);
1604
1605                 if (bset_written(b, i))
1606                         continue;
1607
1608                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1609                 sort_iter_add(&sort_iter,
1610                               btree_bkey_first(b, t),
1611                               btree_bkey_last(b, t));
1612                 seq = max(seq, le64_to_cpu(i->journal_seq));
1613         }
1614
1615         order = get_order(bytes);
1616         data = btree_bounce_alloc(c, order, &used_mempool);
1617
1618         if (!b->written) {
1619                 bn = data;
1620                 *bn = *b->data;
1621                 i = &bn->keys;
1622         } else {
1623                 bne = data;
1624                 bne->keys = b->data->keys;
1625                 i = &bne->keys;
1626         }
1627
1628         i->journal_seq  = cpu_to_le64(seq);
1629         i->u64s         = 0;
1630
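        /*
         * Whiteouts: on non-extent btrees they're sorted in with the rest of
         * the keys; on extent btrees they're kept separate, at the start of
         * the bset:
         */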
1631         if (!btree_node_is_extents(b)) {
1632                 sort_iter_add(&sort_iter,
1633                               unwritten_whiteouts_start(c, b),
1634                               unwritten_whiteouts_end(c, b));
1635                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1636         } else {
1637                 memcpy_u64s(i->start,
1638                             unwritten_whiteouts_start(c, b),
1639                             b->whiteout_u64s);
1640                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1641                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1642         }
1643
1644         b->whiteout_u64s = 0;
1645
1646         u64s = btree_node_is_extents(b)
1647                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1648                 : sort_keys(i->start, &sort_iter, false);
1649         le16_add_cpu(&i->u64s, u64s);
1650
1651         clear_needs_whiteout(i);
1652
1653         /* do we have data to write? */
1654         if (b->written && !i->u64s)
1655                 goto nowrite;
1656
1657         bytes_to_write = vstruct_end(i) - data;
1658         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1659
1660         memset(data + bytes_to_write, 0,
1661                (sectors_to_write << 9) - bytes_to_write);
1662
1663         BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
1664         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1665         BUG_ON(i->seq != b->data->keys.seq);
1666
1667         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1668         SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
1669
1670         /* if we're going to be encrypting, check metadata validity first: */
1671         if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1672             validate_bset_for_write(c, b, i, sectors_to_write))
1673                 goto err;
1674
1675         bset_encrypt(c, i, b->written << 9);
1676
1677         nonce = btree_nonce(i, b->written << 9);
1678
1679         if (bn)
1680                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1681         else
1682                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1683
1684         /* if we're not encrypting, check metadata after checksumming: */
1685         if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1686             validate_bset_for_write(c, b, i, sectors_to_write))
1687                 goto err;
1688
1689         /*
1690          * We handle btree write errors by immediately halting the journal -
1691          * after we've done that, we can't issue any subsequent btree writes
1692          * because they might have pointers to new nodes that failed to write.
1693          *
1694          * Furthermore, there's no point in doing any more btree writes because
1695          * with the journal stopped, we're never going to update the journal to
1696          * reflect that those writes were done and the data flushed from the
1697          * journal:
1698          *
1699          * Make sure to update b->written so bch2_btree_init_next() doesn't
1700          * break:
1701          */
1702         if (bch2_journal_error(&c->journal) ||
1703             c->opts.nochanges)
1704                 goto err;
1705
1706         trace_btree_write(b, bytes_to_write, sectors_to_write);
1707
1708         wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
1709         wbio->cl                = parent;
1710         wbio->order             = order;
1711         wbio->used_mempool      = used_mempool;
1712         wbio->data              = data;
1713         wbio->bio.bi_opf        = REQ_OP_WRITE|REQ_META|REQ_FUA;
1714         wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
1715         wbio->bio.bi_end_io     = btree_node_write_endio;
1716         wbio->bio.bi_private    = b;
1717
1718         if (parent)
1719                 closure_get(parent);
1720
1721         bch2_bio_map(&wbio->bio, data);
1722
1723         /*
1724          * If we're appending to a leaf node, we don't technically need FUA -
1725          * this write just needs to be persisted before the next journal write,
1726          * which will be marked FLUSH|FUA.
1727          *
1728          * Similarly if we're writing a new btree root - the pointer is going to
1729          * be in the next journal entry.
1730          *
1731          * But if we're writing a new btree node (that isn't a root) or
1732          * appending to a non-leaf btree node, we need either FUA or a flush
1733          * when we write the parent with the new pointer. FUA is cheaper than a
1734          * flush, and writes appending to leaf nodes aren't blocking anything, so
1735          * just make all btree node writes FUA to keep things sane.
1736          */
1737
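        /*
         * The pointers in the key we write with are offset by the sectors
         * already written to this node, so this write lands right after the
         * existing data:
         */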
1738         bkey_copy(&k.key, &b->key);
1739         e = bkey_i_to_s_extent(&k.key);
1740
1741         extent_for_each_ptr(e, ptr)
1742                 ptr->offset += b->written;
1743
1744         b->written += sectors_to_write;
1745
1746         bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
1747         return;
1748 err:
1749         set_btree_node_noevict(b);
1750         b->written += sectors_to_write;
1751 nowrite:
1752         btree_bounce_free(c, order, used_mempool, data);
1753         btree_node_write_done(c, b);
1754 }
1755
1756 /*
1757  * Work that must be done with the write lock held, after a btree node write:
1758  */
1759 bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
1760 {
1761         bool invalidated_iter = false;
1762         struct btree_node_entry *bne;
1763         struct bset_tree *t;
1764
1765         if (!btree_node_just_written(b))
1766                 return false;
1767
1768         BUG_ON(b->whiteout_u64s);
1769         BUG_ON(b->uncompacted_whiteout_u64s);
1770
1771         clear_btree_node_just_written(b);
1772
1773         /*
1774          * Note: immediately after write, bset_unwritten()/bset_written() don't
1775          * work - the amount of data we had to write after compaction might have
1776          * been smaller than the offset of the last bset.
1777          *
1778          * However, we know that all bsets have been written here, as long as
1779          * we're still holding the write lock:
1780          */
1781
1782         /*
1783          * XXX: decide if we really want to unconditionally sort down to a
1784          * single bset:
1785          */
1786         if (b->nsets > 1) {
1787                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1788                 invalidated_iter = true;
1789         } else {
1790                 invalidated_iter = bch2_drop_whiteouts(b);
1791         }
1792
1793         for_each_bset(b, t)
1794                 set_needs_whiteout(bset(b, t));
1795
1796         bch2_btree_verify(c, b);
1797
1798         /*
1799          * If later we don't unconditionally sort down to a single bset, we have
1800          * to ensure this is still true:
1801          */
1802         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1803
1804         bne = want_new_bset(c, b);
1805         if (bne)
1806                 bch2_bset_init_next(b, &bne->keys);
1807
1808         bch2_btree_build_aux_trees(b);
1809
1810         return invalidated_iter;
1811 }
1812
1813 /*
1814  * Use this one if the node is intent or read locked (never write locked):
1815  */
1816 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1817                           struct closure *parent,
1818                           enum six_lock_type lock_type_held)
1819 {
1820         BUG_ON(lock_type_held == SIX_LOCK_write);
1821
1822         if (lock_type_held == SIX_LOCK_intent ||
1823             six_trylock_convert(&b->lock, SIX_LOCK_read,
1824                                 SIX_LOCK_intent)) {
1825                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
1826
1827                 /* don't cycle lock unnecessarily: */
1828                 if (btree_node_just_written(b)) {
1829                         six_lock_write(&b->lock);
1830                         bch2_btree_post_write_cleanup(c, b);
1831                         six_unlock_write(&b->lock);
1832                 }
1833
1834                 if (lock_type_held == SIX_LOCK_read)
1835                         six_lock_downgrade(&b->lock);
1836         } else {
1837                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
1838         }
1839 }
1840
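/*
 * Debug check that all btree node writes have completed: walk the btree node
 * cache and verify no node is still marked dirty.
 */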
1841 void bch2_btree_verify_flushed(struct bch_fs *c)
1842 {
1843         struct bucket_table *tbl;
1844         struct rhash_head *pos;
1845         struct btree *b;
1846         unsigned i;
1847
1848         rcu_read_lock();
1849         tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
1850                                   &c->btree_cache.table);
1851
1852         for (i = 0; i < tbl->size; i++)
1853                 rht_for_each_entry_rcu(b, pos, tbl, i, hash)
1854                         BUG_ON(btree_node_dirty(b));
1855         rcu_read_unlock();
1856 }