libbcachefs/btree_io.c
1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_iter.h"
7 #include "btree_locking.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "buckets.h"
11 #include "checksum.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "io.h"
16 #include "journal.h"
17 #include "super-io.h"
18
19 #include <trace/events/bcachefs.h>
20
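/*
 * Debug-only sanity check (CONFIG_BCACHEFS_DEBUG): walk [start, end) and
 * verify adjacent keys are ordered - for extent nodes a key must not overlap
 * the start of the next key, for other nodes positions must be strictly
 * increasing.
 */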
21 static void verify_no_dups(struct btree *b,
22                            struct bkey_packed *start,
23                            struct bkey_packed *end)
24 {
25 #ifdef CONFIG_BCACHEFS_DEBUG
26         struct bkey_packed *k;
27
28         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
29                 struct bkey l = bkey_unpack_key(b, k);
30                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
31
32                 BUG_ON(btree_node_is_extents(b)
33                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
34                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
35                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
36         }
37 #endif
38 }
39
40 static void clear_needs_whiteout(struct bset *i)
41 {
42         struct bkey_packed *k;
43
44         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
45                 k->needs_whiteout = false;
46 }
47
48 static void set_needs_whiteout(struct bset *i)
49 {
50         struct bkey_packed *k;
51
52         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
53                 k->needs_whiteout = true;
54 }
55
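/*
 * Bounce buffers for sorting/compacting: try the page allocator first
 * (GFP_NOWAIT, no warning on failure) and fall back to a preallocated mempool
 * (presumably sized for a full btree node) so forward progress is guaranteed.
 */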
56 static void btree_bounce_free(struct bch_fs *c, unsigned order,
57                               bool used_mempool, void *p)
58 {
59         if (used_mempool)
60                 mempool_free(p, &c->btree_bounce_pool);
61         else
62                 vpfree(p, PAGE_SIZE << order);
63 }
64
65 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
66                                 bool *used_mempool)
67 {
68         void *p;
69
70         BUG_ON(order > btree_page_order(c));
71
72         *used_mempool = false;
73         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
74         if (p)
75                 return p;
76
77         *used_mempool = true;
78         return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
79 }
80
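/*
 * sort_iter: a small N-way merge iterator over up to MAX_BSETS + 1 ranges of
 * packed bkeys, each already sorted, compared with a caller supplied
 * sort_cmp_fn.  Usage pattern in this file (sketch, given a struct bset_tree
 * *t and a sort_cmp_fn cmp):
 *
 *	struct sort_iter iter;
 *	struct bkey_packed *k;
 *
 *	sort_iter_init(&iter, b);
 *	for_each_bset(b, t)
 *		sort_iter_add(&iter, btree_bkey_first(b, t),
 *			      btree_bkey_last(b, t));
 *	sort_iter_sort(&iter, cmp);
 *	while ((k = sort_iter_next(&iter, cmp)))
 *		... use k, the smallest remaining key ...
 *
 * sort_iter_sort() establishes the initial ordering of the sets by their head
 * keys; sort_iter_next() returns the current smallest key and re-sifts.
 */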
81 typedef int (*sort_cmp_fn)(struct btree *,
82                            struct bkey_packed *,
83                            struct bkey_packed *);
84
85 struct sort_iter {
86         struct btree    *b;
87         unsigned                used;
88
89         struct sort_iter_set {
90                 struct bkey_packed *k, *end;
91         } data[MAX_BSETS + 1];
92 };
93
94 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
95 {
96         memset(iter, 0, sizeof(*iter));
97         iter->b = b;
98 }
99
100 static inline void __sort_iter_sift(struct sort_iter *iter,
101                                     unsigned from,
102                                     sort_cmp_fn cmp)
103 {
104         unsigned i;
105
106         for (i = from;
107              i + 1 < iter->used &&
108              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
109              i++)
110                 swap(iter->data[i], iter->data[i + 1]);
111 }
112
113 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
114 {
115
116         __sort_iter_sift(iter, 0, cmp);
117 }
118
119 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
120 {
121         unsigned i = iter->used;
122
123         while (i--)
124                 __sort_iter_sift(iter, i, cmp);
125 }
126
127 static void sort_iter_add(struct sort_iter *iter,
128                           struct bkey_packed *k,
129                           struct bkey_packed *end)
130 {
131         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
132
133         if (k != end)
134                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
135 }
136
137 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
138 {
139         return iter->used ? iter->data->k : NULL;
140 }
141
142 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
143 {
144         iter->data->k = bkey_next(iter->data->k);
145
146         BUG_ON(iter->data->k > iter->data->end);
147
148         if (iter->data->k == iter->data->end)
149                 memmove(&iter->data[0],
150                         &iter->data[1],
151                         sizeof(iter->data[0]) * --iter->used);
152         else
153                 sort_iter_sift(iter, cmp);
154 }
155
156 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
157                                                  sort_cmp_fn cmp)
158 {
159         struct bkey_packed *ret = sort_iter_peek(iter);
160
161         if (ret)
162                 sort_iter_advance(iter, cmp);
163
164         return ret;
165 }
166
167 static inline int sort_key_whiteouts_cmp(struct btree *b,
168                                          struct bkey_packed *l,
169                                          struct bkey_packed *r)
170 {
171         return bkey_cmp_packed(b, l, r);
172 }
173
174 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
175                                    struct sort_iter *iter)
176 {
177         struct bkey_packed *in, *out = dst;
178
179         sort_iter_sort(iter, sort_key_whiteouts_cmp);
180
181         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
182                 bkey_copy(out, in);
183                 out = bkey_next(out);
184         }
185
186         return (u64 *) out - (u64 *) dst;
187 }
188
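/*
 * Extent whiteouts (KEY_TYPE_DISCARD keys with no value) are sorted by start
 * position and coalesced: when consecutive whiteouts overlap or abut, the
 * previous one is extended (bounded by what the bkey format can represent
 * packed) and the next one trimmed, so the output covers the same ranges with
 * fewer keys.
 */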
189 static inline int sort_extent_whiteouts_cmp(struct btree *b,
190                                             struct bkey_packed *l,
191                                             struct bkey_packed *r)
192 {
193         struct bkey ul = bkey_unpack_key(b, l);
194         struct bkey ur = bkey_unpack_key(b, r);
195
196         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
197 }
198
199 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
200                                       struct sort_iter *iter)
201 {
202         const struct bkey_format *f = &iter->b->format;
203         struct bkey_packed *in, *out = dst;
204         struct bkey_i l, r;
205         bool prev = false, l_packed = false;
206         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
207         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
208         u64 new_size;
209
210         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
211
212         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
213
214         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
215                 EBUG_ON(bkeyp_val_u64s(f, in));
216                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
217
218                 r.k = bkey_unpack_key(iter->b, in);
219
220                 if (prev &&
221                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
222                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
223                                 continue;
224
225                         new_size = l_packed
226                                 ? min(max_packed_size, max_packed_offset -
227                                       bkey_start_offset(&l.k))
228                                 : KEY_SIZE_MAX;
229
230                         new_size = min(new_size, r.k.p.offset -
231                                        bkey_start_offset(&l.k));
232
233                         BUG_ON(new_size < l.k.size);
234
235                         bch2_key_resize(&l.k, new_size);
236
237                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
238                                 continue;
239
240                         bch2_cut_front(l.k.p, &r);
241                 }
242
243                 if (prev) {
244                         if (!bch2_bkey_pack(out, &l, f)) {
245                                 BUG_ON(l_packed);
246                                 bkey_copy(out, &l);
247                         }
248                         out = bkey_next(out);
249                 }
250
251                 l = r;
252                 prev = true;
253                 l_packed = bkey_packed(in);
254         }
255
256         if (prev) {
257                 if (!bch2_bkey_pack(out, &l, f)) {
258                         BUG_ON(l_packed);
259                         bkey_copy(out, &l);
260                 }
261                 out = bkey_next(out);
262         }
263
264         return (u64 *) out - (u64 *) dst;
265 }
266
267 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
268                                     bool compacting,
269                                     enum compact_mode mode)
270 {
271         unsigned live_u64s = b->nr.bset_u64s[t - b->set];
272         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
273
274         if (live_u64s == bset_u64s)
275                 return 0;
276
277         if (mode == COMPACT_LAZY) {
278                 if (live_u64s * 4 < bset_u64s * 3 ||
279                     (compacting && bset_unwritten(b, bset(b, t))))
280                         return bset_u64s - live_u64s;
281         } else {
282                 if (bset_written(b, bset(b, t)))
283                         return bset_u64s - live_u64s;
284         }
285
286         return 0;
287 }
288
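/*
 * Compact a btree node's whiteouts: for each bset worth compacting (see
 * should_compact_bset()), keys that are no longer needed are dropped and
 * whiteouts that still have to be written out are moved into the node's
 * unwritten whiteout area, then sorted (and, for extents, coalesced).  In
 * COMPACT_WRITTEN_NO_WRITE_LOCK mode the bsets themselves are left
 * unmodified, since the caller may only hold a read lock.
 */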
289 bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
290                              enum compact_mode mode)
291 {
292         const struct bkey_format *f = &b->format;
293         struct bset_tree *t;
294         struct bkey_packed *whiteouts = NULL;
295         struct bkey_packed *u_start, *u_pos;
296         struct sort_iter sort_iter;
297         unsigned order, whiteout_u64s = 0, u64s;
298         bool used_mempool, compacting = false;
299
300         for_each_bset(b, t)
301                 whiteout_u64s += should_compact_bset(b, t,
302                                         whiteout_u64s != 0, mode);
303
304         if (!whiteout_u64s)
305                 return false;
306
307         sort_iter_init(&sort_iter, b);
308
309         whiteout_u64s += b->whiteout_u64s;
310         order = get_order(whiteout_u64s * sizeof(u64));
311
312         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
313         u_start = u_pos = whiteouts;
314
315         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
316                     b->whiteout_u64s);
317         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
318
319         sort_iter_add(&sort_iter, u_start, u_pos);
320
321         for_each_bset(b, t) {
322                 struct bset *i = bset(b, t);
323                 struct bkey_packed *k, *n, *out, *start, *end;
324                 struct btree_node_entry *src = NULL, *dst = NULL;
325
326                 if (t != b->set && bset_unwritten(b, i)) {
327                         src = container_of(i, struct btree_node_entry, keys);
328                         dst = max(write_block(b),
329                                   (void *) btree_bkey_last(b, t - 1));
330                 }
331
332                 if (!should_compact_bset(b, t, compacting, mode)) {
333                         if (src != dst) {
334                                 memmove(dst, src, sizeof(*src) +
335                                         le16_to_cpu(src->keys.u64s) *
336                                         sizeof(u64));
337                                 i = &dst->keys;
338                                 set_btree_bset(b, t, i);
339                         }
340                         continue;
341                 }
342
343                 compacting = true;
344                 u_start = u_pos;
345                 start = i->start;
346                 end = vstruct_last(i);
347
348                 if (src != dst) {
349                         memmove(dst, src, sizeof(*src));
350                         i = &dst->keys;
351                         set_btree_bset(b, t, i);
352                 }
353
354                 out = i->start;
355
356                 for (k = start; k != end; k = n) {
357                         n = bkey_next(k);
358
359                         if (bkey_deleted(k) && btree_node_is_extents(b))
360                                 continue;
361
362                         if (bkey_whiteout(k) && !k->needs_whiteout)
363                                 continue;
364
365                         if (bkey_whiteout(k)) {
366                                 unreserve_whiteout(b, t, k);
367                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
368                                 set_bkeyp_val_u64s(f, u_pos, 0);
369                                 u_pos = bkey_next(u_pos);
370                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
371                                 bkey_copy(out, k);
372                                 out = bkey_next(out);
373                         }
374                 }
375
376                 sort_iter_add(&sort_iter, u_start, u_pos);
377
378                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
379                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
380                         set_btree_bset_end(b, t);
381                         bch2_bset_set_no_aux_tree(b, t);
382                 }
383         }
384
385         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
386
387         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
388                (void *) btree_bkey_last(b, bset_tree_last(b)));
389
390         u64s = btree_node_is_extents(b)
391                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
392                                         &sort_iter)
393                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
394                                      &sort_iter);
395
396         BUG_ON(u64s > b->whiteout_u64s);
397         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
398         BUG_ON(u_pos != whiteouts && !u64s);
399
400         if (u64s != b->whiteout_u64s) {
401                 void *src = unwritten_whiteouts_start(c, b);
402
403                 b->whiteout_u64s = u64s;
404                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
405         }
406
407         verify_no_dups(b,
408                        unwritten_whiteouts_start(c, b),
409                        unwritten_whiteouts_end(c, b));
410
411         btree_bounce_free(c, order, used_mempool, whiteouts);
412
413         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
414                 bch2_btree_build_aux_trees(b);
415
416         bch_btree_keys_u64s_remaining(c, b);
417         bch2_verify_btree_nr_keys(b);
418
419         return true;
420 }
421
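/*
 * Unlike __bch2_compact_whiteouts(), this simply drops all whiteouts from
 * every bset worth compacting - presumably used when the whiteouts no longer
 * need to be written out.
 */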
422 static bool bch2_drop_whiteouts(struct btree *b)
423 {
424         struct bset_tree *t;
425         bool ret = false;
426
427         for_each_bset(b, t) {
428                 struct bset *i = bset(b, t);
429                 struct bkey_packed *k, *n, *out, *start, *end;
430
431                 if (!should_compact_bset(b, t, true, true))
432                         continue;
433
434                 start   = btree_bkey_first(b, t);
435                 end     = btree_bkey_last(b, t);
436
437                 if (bset_unwritten(b, i) &&
438                     t != b->set) {
439                         struct bset *dst =
440                                max_t(struct bset *, write_block(b),
441                                      (void *) btree_bkey_last(b, t - 1));
442
443                         memmove(dst, i, sizeof(struct bset));
444                         i = dst;
445                         set_btree_bset(b, t, i);
446                 }
447
448                 out = i->start;
449
450                 for (k = start; k != end; k = n) {
451                         n = bkey_next(k);
452
453                         if (!bkey_whiteout(k)) {
454                                 bkey_copy(out, k);
455                                 out = bkey_next(out);
456                         }
457                 }
458
459                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
460                 bch2_bset_set_no_aux_tree(b, t);
461                 ret = true;
462         }
463
464         bch2_verify_btree_nr_keys(b);
465
466         return ret;
467 }
468
469 static inline int sort_keys_cmp(struct btree *b,
470                                 struct bkey_packed *l,
471                                 struct bkey_packed *r)
472 {
473         return bkey_cmp_packed(b, l, r) ?:
474                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
475                 (int) l->needs_whiteout - (int) r->needs_whiteout;
476 }
477
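/*
 * Merge keys from multiple bsets, resolving duplicates: whiteouts sort before
 * other keys at the same position, and a whiteout immediately followed by a
 * key at the same position is dropped, transferring its needs_whiteout flag
 * to that key.  Whiteouts that are kept are emitted as just the key, with no
 * value; with filter_whiteouts, whiteouts are dropped entirely.
 */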
478 static unsigned sort_keys(struct bkey_packed *dst,
479                           struct sort_iter *iter,
480                           bool filter_whiteouts)
481 {
482         const struct bkey_format *f = &iter->b->format;
483         struct bkey_packed *in, *next, *out = dst;
484
485         sort_iter_sort(iter, sort_keys_cmp);
486
487         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
488                 if (bkey_whiteout(in) &&
489                     (filter_whiteouts || !in->needs_whiteout))
490                         continue;
491
492                 if (bkey_whiteout(in) &&
493                     (next = sort_iter_peek(iter)) &&
494                     !bkey_cmp_packed(iter->b, in, next)) {
495                         BUG_ON(in->needs_whiteout &&
496                                next->needs_whiteout);
497                         /*
498                          * XXX racy, called with read lock from write path
499                          *
500                          * leads to spurious BUG_ON() in bkey_unpack_key() in
501                          * debug mode
502                          */
503                         next->needs_whiteout |= in->needs_whiteout;
504                         continue;
505                 }
506
507                 if (bkey_whiteout(in)) {
508                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
509                         set_bkeyp_val_u64s(f, out, 0);
510                 } else {
511                         bkey_copy(out, in);
512                 }
513                 out = bkey_next(out);
514         }
515
516         return (u64 *) out - (u64 *) dst;
517 }
518
519 static inline int sort_extents_cmp(struct btree *b,
520                                    struct bkey_packed *l,
521                                    struct bkey_packed *r)
522 {
523         return bkey_cmp_packed(b, l, r) ?:
524                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
525 }
526
527 static unsigned sort_extents(struct bkey_packed *dst,
528                              struct sort_iter *iter,
529                              bool filter_whiteouts)
530 {
531         struct bkey_packed *in, *out = dst;
532
533         sort_iter_sort(iter, sort_extents_cmp);
534
535         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
536                 if (bkey_deleted(in))
537                         continue;
538
539                 if (bkey_whiteout(in) &&
540                     (filter_whiteouts || !in->needs_whiteout))
541                         continue;
542
543                 bkey_copy(out, in);
544                 out = bkey_next(out);
545         }
546
547         return (u64 *) out - (u64 *) dst;
548 }
549
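/*
 * Sort bsets [start_idx, end_idx) of @b together into a bounce buffer, then
 * copy the result back into the first of those bsets - or, when sorting the
 * entire node, simply swap the node's buffer with the bounce buffer - and
 * shift the remaining bset_trees down.
 */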
550 static void btree_node_sort(struct bch_fs *c, struct btree *b,
551                             struct btree_iter *iter,
552                             unsigned start_idx,
553                             unsigned end_idx,
554                             bool filter_whiteouts)
555 {
556         struct btree_node *out;
557         struct sort_iter sort_iter;
558         struct bset_tree *t;
559         struct bset *start_bset = bset(b, &b->set[start_idx]);
560         bool used_mempool = false;
561         u64 start_time;
562         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
563         bool sorting_entire_node = start_idx == 0 &&
564                 end_idx == b->nsets;
565
566         sort_iter_init(&sort_iter, b);
567
568         for (t = b->set + start_idx;
569              t < b->set + end_idx;
570              t++) {
571                 u64s += le16_to_cpu(bset(b, t)->u64s);
572                 sort_iter_add(&sort_iter,
573                               btree_bkey_first(b, t),
574                               btree_bkey_last(b, t));
575         }
576
577         order = sorting_entire_node
578                 ? btree_page_order(c)
579                 : get_order(__vstruct_bytes(struct btree_node, u64s));
580
581         out = btree_bounce_alloc(c, order, &used_mempool);
582
583         start_time = local_clock();
584
585         if (btree_node_is_extents(b))
586                 filter_whiteouts = bset_written(b, start_bset);
587
588         u64s = btree_node_is_extents(b)
589                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
590                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
591
592         out->keys.u64s = cpu_to_le16(u64s);
593
594         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
595
596         if (sorting_entire_node)
597                 bch2_time_stats_update(&c->btree_sort_time, start_time);
598
599         /* Make sure we preserve bset journal_seq: */
600         for (t = b->set + start_idx + 1;
601              t < b->set + end_idx;
602              t++)
603                 start_bset->journal_seq =
604                         max(start_bset->journal_seq,
605                             bset(b, t)->journal_seq);
606
607         if (sorting_entire_node) {
608                 unsigned u64s = le16_to_cpu(out->keys.u64s);
609
610                 BUG_ON(order != btree_page_order(c));
611
612                 /*
613                  * Our temporary buffer is the same size as the btree node's
614                  * buffer, we can just swap buffers instead of doing a big
615                  * memcpy()
616                  */
617                 *out = *b->data;
618                 out->keys.u64s = cpu_to_le16(u64s);
619                 swap(out, b->data);
620                 set_btree_bset(b, b->set, &b->data->keys);
621         } else {
622                 start_bset->u64s = out->keys.u64s;
623                 memcpy_u64s(start_bset->start,
624                             out->keys.start,
625                             le16_to_cpu(out->keys.u64s));
626         }
627
628         for (i = start_idx + 1; i < end_idx; i++)
629                 b->nr.bset_u64s[start_idx] +=
630                         b->nr.bset_u64s[i];
631
632         b->nsets -= shift;
633
634         for (i = start_idx + 1; i < b->nsets; i++) {
635                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
636                 b->set[i]               = b->set[i + shift];
637         }
638
639         for (i = b->nsets; i < MAX_BSETS; i++)
640                 b->nr.bset_u64s[i] = 0;
641
642         set_btree_bset_end(b, &b->set[start_idx]);
643         bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
644
645         btree_bounce_free(c, order, used_mempool, out);
646
647         bch2_verify_btree_nr_keys(b);
648 }
649
650 /* Sort + repack in a new format: */
651 static struct btree_nr_keys sort_repack(struct bset *dst,
652                                         struct btree *src,
653                                         struct btree_node_iter *src_iter,
654                                         struct bkey_format *out_f,
655                                         bool filter_whiteouts)
656 {
657         struct bkey_format *in_f = &src->format;
658         struct bkey_packed *in, *out = vstruct_last(dst);
659         struct btree_nr_keys nr;
660
661         memset(&nr, 0, sizeof(nr));
662
663         while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
664                 if (filter_whiteouts && bkey_whiteout(in))
665                         continue;
666
667                 if (bch2_bkey_transform(out_f, out, bkey_packed(in)
668                                        ? in_f : &bch2_bkey_format_current, in))
669                         out->format = KEY_FORMAT_LOCAL_BTREE;
670                 else
671                         bch2_bkey_unpack(src, (void *) out, in);
672
673                 btree_keys_account_key_add(&nr, 0, out);
674                 out = bkey_next(out);
675         }
676
677         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
678         return nr;
679 }
680
681 /* Sort, repack, and merge: */
682 static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
683                                               struct bset *dst,
684                                               struct btree *src,
685                                               struct btree_node_iter *iter,
686                                               struct bkey_format *out_f,
687                                               bool filter_whiteouts,
688                                               key_filter_fn filter,
689                                               key_merge_fn merge)
690 {
691         struct bkey_packed *k, *prev = NULL, *out;
692         struct btree_nr_keys nr;
693         BKEY_PADDED(k) tmp;
694
695         memset(&nr, 0, sizeof(nr));
696
697         while ((k = bch2_btree_node_iter_next_all(iter, src))) {
698                 if (filter_whiteouts && bkey_whiteout(k))
699                         continue;
700
701                 /*
702                  * The filter might modify pointers, so we have to unpack the
703                  * key and values to &tmp.k:
704                  */
705                 bch2_bkey_unpack(src, &tmp.k, k);
706
707                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
708                         continue;
709
710                 /* prev is always unpacked, for key merging: */
711
712                 if (prev &&
713                     merge &&
714                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
715                         continue;
716
717                 /*
718                  * the current key becomes the new prev: advance prev, then
719                  * copy the current key - but first pack prev (in place):
720                  */
721                 if (prev) {
722                         bch2_bkey_pack(prev, (void *) prev, out_f);
723
724                         btree_keys_account_key_add(&nr, 0, prev);
725                         prev = bkey_next(prev);
726                 } else {
727                         prev = vstruct_last(dst);
728                 }
729
730                 bkey_copy(prev, &tmp.k);
731         }
732
733         if (prev) {
734                 bch2_bkey_pack(prev, (void *) prev, out_f);
735                 btree_keys_account_key_add(&nr, 0, prev);
736                 out = bkey_next(prev);
737         } else {
738                 out = vstruct_last(dst);
739         }
740
741         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
742         return nr;
743 }
744
745 void bch2_btree_sort_into(struct bch_fs *c,
746                          struct btree *dst,
747                          struct btree *src)
748 {
749         struct btree_nr_keys nr;
750         struct btree_node_iter src_iter;
751         u64 start_time = local_clock();
752
753         BUG_ON(dst->nsets != 1);
754
755         bch2_bset_set_no_aux_tree(dst, dst->set);
756
757         bch2_btree_node_iter_init_from_start(&src_iter, src,
758                                             btree_node_is_extents(src));
759
760         if (btree_node_ops(src)->key_normalize ||
761             btree_node_ops(src)->key_merge)
762                 nr = sort_repack_merge(c, btree_bset_first(dst),
763                                 src, &src_iter,
764                                 &dst->format,
765                                 true,
766                                 btree_node_ops(src)->key_normalize,
767                                 btree_node_ops(src)->key_merge);
768         else
769                 nr = sort_repack(btree_bset_first(dst),
770                                 src, &src_iter,
771                                 &dst->format,
772                                 true);
773
774         bch2_time_stats_update(&c->btree_sort_time, start_time);
775
776         set_btree_bset_end(dst, dst->set);
777
778         dst->nr.live_u64s       += nr.live_u64s;
779         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
780         dst->nr.packed_keys     += nr.packed_keys;
781         dst->nr.unpacked_keys   += nr.unpacked_keys;
782
783         bch2_verify_btree_nr_keys(dst);
784 }
785
786 #define SORT_CRIT       (4096 / sizeof(u64))
787
788 /*
789  * We're about to add another bset to the btree node, so if there are currently
790  * too many bsets, sort some of them together:
791  */
792 static bool btree_node_compact(struct bch_fs *c, struct btree *b,
793                                struct btree_iter *iter)
794 {
795         unsigned unwritten_idx;
796         bool ret = false;
797
798         for (unwritten_idx = 0;
799              unwritten_idx < b->nsets;
800              unwritten_idx++)
801                 if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
802                         break;
803
804         if (b->nsets - unwritten_idx > 1) {
805                 btree_node_sort(c, b, iter, unwritten_idx,
806                                 b->nsets, false);
807                 ret = true;
808         }
809
810         if (unwritten_idx > 1) {
811                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
812                 ret = true;
813         }
814
815         return ret;
816 }
817
818 void bch2_btree_build_aux_trees(struct btree *b)
819 {
820         struct bset_tree *t;
821
822         for_each_bset(b, t)
823                 bch2_bset_build_aux_tree(b, t,
824                                 bset_unwritten(b, bset(b, t)) &&
825                                 t == bset_tree_last(b));
826 }
827
828 /*
829  * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
830  * inserted into
831  *
832  * Safe to call if there already is an unwritten bset - will only add a new bset
833  * if @b doesn't already have one.
834  *
835  * If we sorted (i.e. invalidated iterators), @iter is reinitialized.
836  */
837 void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
838                          struct btree_iter *iter)
839 {
840         struct btree_node_entry *bne;
841         bool did_sort;
842
843         EBUG_ON(!(b->lock.state.seq & 1));
844         EBUG_ON(iter && iter->nodes[b->level] != b);
845
846         did_sort = btree_node_compact(c, b, iter);
847
848         bne = want_new_bset(c, b);
849         if (bne)
850                 bch2_bset_init_next(b, &bne->keys);
851
852         bch2_btree_build_aux_trees(b);
853
854         if (iter && did_sort)
855                 bch2_btree_iter_reinit_node(iter, b);
856 }
857
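/*
 * The nonce used for bset encryption/checksumming is derived from the bset's
 * seq, journal_seq and its byte offset within the node, so every bset in a
 * node gets a distinct nonce.
 */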
858 static struct nonce btree_nonce(struct btree *b,
859                                 struct bset *i,
860                                 unsigned offset)
861 {
862         return (struct nonce) {{
863                 [0] = cpu_to_le32(offset),
864                 [1] = ((__le32 *) &i->seq)[0],
865                 [2] = ((__le32 *) &i->seq)[1],
866                 [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
867         }};
868 }
869
870 static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
871 {
872         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
873                     vstruct_end(i) - (void *) i->_data);
874 }
875
876 #define btree_node_error(c, b, msg, ...)                                \
877 do {                                                                    \
878         if (write == READ &&                                            \
879             !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
880                 mustfix_fsck_err(c,                                     \
881                         "btree node read error at btree %u level %u/%u\n"\
882                         "pos %llu:%llu node offset %u bset u64s %u: " msg,\
883                         (b)->btree_id, (b)->level,                      \
884                         (c)->btree_roots[(b)->btree_id].level,          \
885                         (b)->key.k.p.inode, (b)->key.k.p.offset,        \
886                         (b)->written, le16_to_cpu((i)->u64s),           \
887                         ##__VA_ARGS__);                                 \
888         } else {                                                        \
889                 bch_err(c, "%s at btree %u level %u/%u\n"               \
890                         "pos %llu:%llu node offset %u bset u64s %u: " msg,\
891                         write == WRITE                                  \
892                         ? "corrupt metadata in btree node write"        \
893                         : "btree node error",                           \
894                         (b)->btree_id, (b)->level,                      \
895                         (c)->btree_roots[(b)->btree_id].level,          \
896                         (b)->key.k.p.inode, (b)->key.k.p.offset,        \
897                         (b)->written, le16_to_cpu((i)->u64s),           \
898                         ##__VA_ARGS__);                                 \
899                 ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
900                 goto fsck_err;                                          \
901         }                                                               \
902 } while (0)
903
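/*
 * Validate one bset in place: bsets with an unsupported version or extending
 * past the end of the node are emptied, individually bad keys are dropped,
 * and the boundary between separate whiteouts and ordinary keys is reported
 * via *whiteout_u64s.
 */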
904 static int validate_bset(struct bch_fs *c, struct btree *b,
905                          struct bset *i, unsigned sectors,
906                          unsigned *whiteout_u64s, int write)
907 {
908         struct bkey_packed *k, *prev = NULL;
909         struct bpos prev_pos = POS_MIN;
910         bool seen_non_whiteout = false;
911         int ret = 0;
912
913         if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
914                 btree_node_error(c, b, "unsupported bset version");
915                 i->u64s = 0;
916                 return 0;
917         }
918
919         if (b->written + sectors > c->sb.btree_node_size) {
920                 btree_node_error(c, b, "bset past end of btree node");
921                 i->u64s = 0;
922                 return 0;
923         }
924
925         if (b->written && !i->u64s)
926                 btree_node_error(c, b, "empty set");
927
928         if (!BSET_SEPARATE_WHITEOUTS(i)) {
929                 seen_non_whiteout = true;
930                 *whiteout_u64s = 0;
931         }
932
933         for (k = i->start;
934              k != vstruct_last(i);) {
935                 struct bkey_s_c u;
936                 struct bkey tmp;
937                 const char *invalid;
938
939                 if (!k->u64s) {
940                         btree_node_error(c, b,
941                                 "KEY_U64s 0: %zu bytes of metadata lost",
942                                 vstruct_end(i) - (void *) k);
943
944                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
945                         break;
946                 }
947
948                 if (bkey_next(k) > vstruct_last(i)) {
949                         btree_node_error(c, b,
950                                          "key extends past end of bset");
951
952                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
953                         break;
954                 }
955
956                 if (k->format > KEY_FORMAT_CURRENT) {
957                         btree_node_error(c, b,
958                                          "invalid bkey format %u", k->format);
959
960                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
961                         memmove_u64s_down(k, bkey_next(k),
962                                           (u64 *) vstruct_end(i) - (u64 *) k);
963                         continue;
964                 }
965
966                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
967                         bch2_bkey_swab(btree_node_type(b), &b->format, k);
968
969                 u = bkey_disassemble(b, k, &tmp);
970
971                 invalid = bch2_btree_bkey_invalid(c, b, u);
972                 if (invalid) {
973                         char buf[160];
974
975                         bch2_bkey_val_to_text(c, btree_node_type(b),
976                                               buf, sizeof(buf), u);
977                         btree_node_error(c, b,
978                                          "invalid bkey %s: %s", buf, invalid);
979
980                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
981                         memmove_u64s_down(k, bkey_next(k),
982                                           (u64 *) vstruct_end(i) - (u64 *) k);
983                         continue;
984                 }
985
986                 /*
987                  * with the separate whiteouts thing (used for extents), the
988                  * second set of keys actually can have whiteouts too, so we
989                  * can't solely go off bkey_whiteout()...
990                  */
991
992                 if (!seen_non_whiteout &&
993                     (!bkey_whiteout(k) ||
994                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
995                         *whiteout_u64s = k->_data - i->_data;
996                         seen_non_whiteout = true;
997                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
998                         btree_node_error(c, b,
999                                          "keys out of order: %llu:%llu > %llu:%llu",
1000                                          prev_pos.inode,
1001                                          prev_pos.offset,
1002                                          u.k->p.inode,
1003                                          bkey_start_offset(u.k));
1004                         /* XXX: repair this */
1005                 }
1006
1007                 prev_pos = u.k->p;
1008                 prev = k;
1009                 k = bkey_next(k);
1010         }
1011
1012         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
1013 fsck_err:
1014         return ret;
1015 }
1016
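/*
 * Parse and verify a freshly read btree node: walk the on-disk bsets,
 * checking magic, checksums and keys and decrypting as we go, then merge them
 * all into a single sorted bset in memory.  Returns nonzero if the read
 * should be retried (e.g. from another replica).
 */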
1017 int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
1018 {
1019         struct btree_node_entry *bne;
1020         struct bset *i = &b->data->keys;
1021         struct btree_node_iter *iter;
1022         struct btree_node *sorted;
1023         bool used_mempool;
1024         unsigned u64s;
1025         const char *err;
1026         struct bch_csum csum;
1027         struct nonce nonce;
1028         int ret, should_retry = 0, write = READ;
1029
1030         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1031         __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
1032
1033         err = "dynamic fault";
1034         if (bch2_meta_read_fault("btree"))
1035                 goto err;
1036
1037         while (b->written < c->sb.btree_node_size) {
1038                 unsigned sectors, whiteout_u64s = 0;
1039
1040                 if (!b->written) {
1041                         i = &b->data->keys;
1042
1043                         err = "bad magic";
1044                         if (le64_to_cpu(b->data->magic) != bset_magic(c))
1045                                 goto retry_err;
1046
1047                         err = "bad btree header";
1048                         if (!b->data->keys.seq)
1049                                 goto retry_err;
1050
1051                         err = "unknown checksum type";
1052                         if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1053                                 goto retry_err;
1054
1055                         nonce = btree_nonce(b, i, b->written << 9);
1056                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1057
1058                         err = "bad checksum";
1059                         if (bch2_crc_cmp(csum, b->data->csum))
1060                                 goto retry_err;
1061
1062                         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1063                                     &b->data->flags,
1064                                     (void *) &b->data->keys -
1065                                     (void *) &b->data->flags);
1066                         nonce = nonce_add(nonce,
1067                                           round_up((void *) &b->data->keys -
1068                                                    (void *) &b->data->flags,
1069                                                    CHACHA20_BLOCK_SIZE));
1070                         bset_encrypt(c, i, nonce);
1071
1072                         sectors = vstruct_sectors(b->data, c->block_bits);
1073
1074                         if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
1075                                 u64 *p = (u64 *) &b->data->ptr;
1076
1077                                 *p = swab64(*p);
1078                                 bch2_bpos_swab(&b->data->min_key);
1079                                 bch2_bpos_swab(&b->data->max_key);
1080                         }
1081
1082                         err = "incorrect btree id";
1083                         if (BTREE_NODE_ID(b->data) != b->btree_id)
1084                                 goto err;
1085
1086                         err = "incorrect level";
1087                         if (BTREE_NODE_LEVEL(b->data) != b->level)
1088                                 goto err;
1089
1090                         err = "incorrect max key";
1091                         if (bkey_cmp(b->data->max_key, b->key.k.p))
1092                                 goto err;
1093 #if 0
1094                         /*
1095                          * not correct anymore, due to btree node write error
1096                          * handling
1097                          *
1098                          * need to add b->data->seq to btree keys and verify
1099                          * against that
1100                          */
1101                         err = "incorrect backpointer";
1102                         if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
1103                                                  b->data->ptr))
1104                                 goto err;
1105 #endif
1106                         err = bch2_bkey_format_validate(&b->data->format);
1107                         if (err)
1108                                 goto err;
1109
1110                         set_btree_bset(b, b->set, &b->data->keys);
1111
1112                         btree_node_set_format(b, b->data->format);
1113                 } else {
1114                         bne = write_block(b);
1115                         i = &bne->keys;
1116
1117                         if (i->seq != b->data->keys.seq)
1118                                 break;
1119
1120                         err = "unknown checksum type";
1121                         if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
1122                                 goto retry_err;
1123
1124                         nonce = btree_nonce(b, i, b->written << 9);
1125                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1126
1127                         err = "bad checksum";
1128                         if (bch2_crc_cmp(csum, bne->csum))
1129                                 goto retry_err;
1130
1131                         bset_encrypt(c, i, nonce);
1132
1133                         sectors = vstruct_sectors(bne, c->block_bits);
1134                 }
1135
1136                 ret = validate_bset(c, b, i, sectors, &whiteout_u64s, READ);
1137                 if (ret)
1138                         goto fsck_err;
1139
1140                 b->written += sectors;
1141
1142                 err = "insufficient memory";
1143                 ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1144                 if (ret < 0)
1145                         goto err;
1146
1147                 if (ret)
1148                         continue;
1149
1150                 __bch2_btree_node_iter_push(iter, b,
1151                                            i->start,
1152                                            vstruct_idx(i, whiteout_u64s));
1153
1154                 __bch2_btree_node_iter_push(iter, b,
1155                                            vstruct_idx(i, whiteout_u64s),
1156                                            vstruct_last(i));
1157         }
1158
1159         err = "corrupted btree";
1160         for (bne = write_block(b);
1161              bset_byte_offset(b, bne) < btree_bytes(c);
1162              bne = (void *) bne + block_bytes(c))
1163                 if (bne->keys.seq == b->data->keys.seq)
1164                         goto err;
1165
1166         sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
1167         sorted->keys.u64s = 0;
1168
1169         b->nr = btree_node_is_extents(b)
1170                 ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1171                 : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
1172
1173         u64s = le16_to_cpu(sorted->keys.u64s);
1174         *sorted = *b->data;
1175         sorted->keys.u64s = cpu_to_le16(u64s);
1176         swap(sorted, b->data);
1177         set_btree_bset(b, b->set, &b->data->keys);
1178         b->nsets = 1;
1179
1180         BUG_ON(b->nr.live_u64s != u64s);
1181
1182         btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
1183
1184         bch2_bset_build_aux_tree(b, b->set, false);
1185
1186         set_needs_whiteout(btree_bset_first(b));
1187
1188         btree_node_reset_sib_u64s(b);
1189 out:
1190         mempool_free(iter, &c->fill_iter);
1191         return should_retry;
1192 err:
1193         btree_node_error(c, b, "%s", err);
1194 fsck_err:
1195         bch2_inconsistent_error(c);
1196         set_btree_node_read_error(b);
1197         goto out;
1198 retry_err:
1199         should_retry = -1;
1200         goto out;
1201 }
1202
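/*
 * Read completion work: if the read failed or didn't validate, retry
 * synchronously from each remaining replica (avoiding the device already
 * tried) before giving up and marking the node as having a read error.
 */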
1203 static void btree_node_read_work(struct work_struct *work)
1204 {
1205         struct btree_read_bio *rb =
1206                 container_of(work, struct btree_read_bio, work);
1207         struct bch_fs *c        = rb->c;
1208         struct bch_dev *ca      = rb->pick.ca;
1209         struct btree *b         = rb->bio.bi_private;
1210         struct bio *bio         = &rb->bio;
1211         struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
1212         const struct bch_extent_ptr *ptr;
1213         struct bch_devs_mask avoid;
1214
1215         bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
1216         percpu_ref_put(&rb->pick.ca->io_ref);
1217
1218         if (!bio->bi_error &&
1219             !bch2_btree_node_read_done(c, b))
1220                 goto out;
1221
1222         goto err;
1223 out:
1224         bch2_time_stats_update(&c->btree_read_time, rb->start_time);
1225         bio_put(&rb->bio);
1226         clear_btree_node_read_in_flight(b);
1227         wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1228         return;
1229 err:
1230         memset(&avoid, 0, sizeof(avoid));
1231         __set_bit(ca->dev_idx, avoid.d);
1232
1233         extent_for_each_ptr(e, ptr) {
1234                 memset(&rb->pick, 0, sizeof(rb->pick));
1235                 bch2_get_read_device(c, e.k, ptr, NULL, &avoid, &rb->pick);
1236
1237                 if (!rb->pick.ca)
1238                         continue;
1239
1240                 bio_reset(bio);
1241                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1242                 bio->bi_bdev            = rb->pick.ca->disk_sb.bdev;
1243                 bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
1244                 bio->bi_iter.bi_size    = btree_bytes(c);
1245                 submit_bio_wait(bio);
1246
1247                 bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
1248                 percpu_ref_put(&rb->pick.ca->io_ref);
1249
1250                 if (!bio->bi_error &&
1251                     !bch2_btree_node_read_done(c, b))
1252                         goto out;
1253         }
1254
1255         set_btree_node_read_error(b);
1256         goto out;
1257 }
1258
1259 static void btree_node_read_endio(struct bio *bio)
1260 {
1261         struct btree_read_bio *rb =
1262                 container_of(bio, struct btree_read_bio, bio);
1263
1264         INIT_WORK(&rb->work, btree_node_read_work);
1265         schedule_work(&rb->work);
1266 }
1267
1268 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
1269                           bool sync)
1270 {
1271         struct extent_pick_ptr pick;
1272         struct btree_read_bio *rb;
1273         struct bio *bio;
1274
1275         trace_btree_read(c, b);
1276
1277         pick = bch2_btree_pick_ptr(c, b);
1278         if (bch2_fs_fatal_err_on(!pick.ca, c,
1279                         "btree node read error: no device to read from")) {
1280                 set_btree_node_read_error(b);
1281                 return;
1282         }
1283
1284         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
1285         rb = container_of(bio, struct btree_read_bio, bio);
1286         rb->c                   = c;
1287         rb->start_time          = local_clock();
1288         rb->pick                = pick;
1289         bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1290         bio->bi_bdev            = pick.ca->disk_sb.bdev;
1291         bio->bi_iter.bi_sector  = pick.ptr.offset;
1292         bio->bi_iter.bi_size    = btree_bytes(c);
1293         bch2_bio_map(bio, b->data);
1294
1295         this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
1296                      bio_sectors(bio));
1297
1298         set_btree_node_read_in_flight(b);
1299
1300         if (sync) {
1301                 submit_bio_wait(bio);
1302                 bio->bi_private = b;
1303                 btree_node_read_work(&rb->work);
1304         } else {
1305                 bio->bi_end_io  = btree_node_read_endio;
1306                 bio->bi_private = b;
1307                 submit_bio(bio);
1308         }
1309 }
1310
1311 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
1312                         const struct bkey_i *k, unsigned level)
1313 {
1314         struct closure cl;
1315         struct btree *b;
1316         int ret;
1317
1318         closure_init_stack(&cl);
1319
1320         do {
1321                 ret = bch2_btree_node_cannibalize_lock(c, &cl);
1322                 closure_sync(&cl);
1323         } while (ret);
1324
1325         b = bch2_btree_node_mem_alloc(c);
1326         bch2_btree_node_cannibalize_unlock(c);
1327
1328         BUG_ON(IS_ERR(b));
1329
1330         bkey_copy(&b->key, k);
1331         BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
1332
1333         bch2_btree_node_read(c, b, true);
1334         six_unlock_write(&b->lock);
1335
1336         if (btree_node_read_error(b)) {
1337                 six_unlock_intent(&b->lock);
1338                 return -EIO;
1339         }
1340
1341         bch2_btree_set_root_for_read(c, b);
1342         six_unlock_intent(&b->lock);
1343
1344         return 0;
1345 }
1346
1347 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1348                               struct btree_write *w)
1349 {
1350         bch2_journal_pin_drop(&c->journal, &w->journal);
1351         closure_wake_up(&w->wait);
1352 }
1353
1354 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1355 {
1356         struct btree_write *w = btree_prev_write(b);
1357
1358         bch2_btree_complete_write(c, b, w);
1359         btree_node_io_unlock(b);
1360 }
1361
1362 static void bch2_btree_node_write_error(struct bch_fs *c,
1363                                         struct bch_write_bio *wbio)
1364 {
1365         struct btree *b         = wbio->bio.bi_private;
1366         struct closure *cl      = wbio->cl;
1367         __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
1368         struct bkey_i_extent *new_key;
1369
1370         six_lock_read(&b->lock);
1371         bkey_copy(&tmp.k, &b->key);
1372         six_unlock_read(&b->lock);
1373
1374         if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
1375                 /* Node has been freed: */
1376                 goto out;
1377         }
1378
1379         new_key = bkey_i_to_extent(&tmp.k);
1380
1381         while (wbio->replicas_failed) {
1382                 unsigned idx = __fls(wbio->replicas_failed);
1383
1384                 bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
1385                 wbio->replicas_failed ^= 1 << idx;
1386         }
1387
1388         if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
1389             bch2_btree_node_update_key(c, b, new_key)) {
1390                 set_btree_node_noevict(b);
1391                 bch2_fatal_error(c);
1392         }
1393 out:
1394         bio_put(&wbio->bio);
1395         btree_node_write_done(c, b);
1396         if (cl)
1397                 closure_put(cl);
1398 }
1399
1400 void bch2_btree_write_error_work(struct work_struct *work)
1401 {
1402         struct bch_fs *c = container_of(work, struct bch_fs,
1403                                         btree_write_error_work);
1404         struct bio *bio;
1405
1406         while (1) {
1407                 spin_lock_irq(&c->btree_write_error_lock);
1408                 bio = bio_list_pop(&c->btree_write_error_list);
1409                 spin_unlock_irq(&c->btree_write_error_lock);
1410
1411                 if (!bio)
1412                         break;
1413
1414                 bch2_btree_node_write_error(c, to_wbio(bio));
1415         }
1416 }
1417
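/*
 * Write completion: failed replicas are recorded in the (parent) bio's
 * replicas_failed mask; if any replica failed, the bio is handed off to
 * btree_write_error_work so the failed pointers can be dropped from the
 * node's key.
 */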
1418 static void btree_node_write_endio(struct bio *bio)
1419 {
1420         struct btree *b                 = bio->bi_private;
1421         struct bch_write_bio *wbio      = to_wbio(bio);
1422         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
1423         struct bch_write_bio *orig      = parent ?: wbio;
1424         struct closure *cl              = !wbio->split ? wbio->cl : NULL;
1425         struct bch_fs *c                = wbio->c;
1426         struct bch_dev *ca              = wbio->ca;
1427
1428         if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
1429             bch2_meta_write_fault("btree"))
1430                 set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
1431
1432         if (wbio->have_io_ref)
1433                 percpu_ref_put(&ca->io_ref);
1434
1435         if (parent) {
1436                 bio_put(bio);
1437                 bio_endio(&parent->bio);
1438                 return;
1439         }
1440
1441         btree_bounce_free(c,
1442                 wbio->order,
1443                 wbio->used_mempool,
1444                 wbio->data);
1445
1446         if (wbio->replicas_failed) {
1447                 unsigned long flags;
1448
1449                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1450                 bio_list_add(&c->btree_write_error_list, &wbio->bio);
1451                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1452                 queue_work(c->wq, &c->btree_write_error_work);
1453                 return;
1454         }
1455
1456         bio_put(bio);
1457         btree_node_write_done(c, b);
1458         if (cl)
1459                 closure_put(cl);
1460 }
1461
1462 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
1463                                    struct bset *i, unsigned sectors)
1464 {
1465         const struct bch_extent_ptr *ptr;
1466         unsigned whiteout_u64s = 0;
1467         int ret;
1468
1469         extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
1470                 break;
1471
1472         ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE);
1473         if (ret)
1474                 bch2_inconsistent_error(c);
1475
1476         return ret;
1477 }
1478
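/*
 * Write out the unwritten part of a btree node.  The dirty bit acts as the
 * write "lock" (see the comment below); whiteouts are compacted first, then
 * the unwritten bsets are gathered into a sort_iter and sorted into a bounce
 * buffer before being checksummed/encrypted and submitted.
 */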
1479 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1480                             struct closure *parent,
1481                             enum six_lock_type lock_type_held)
1482 {
1483         struct bch_write_bio *wbio;
1484         struct bset_tree *t;
1485         struct bset *i;
1486         struct btree_node *bn = NULL;
1487         struct btree_node_entry *bne = NULL;
1488         BKEY_PADDED(key) k;
1489         struct bkey_s_extent e;
1490         struct bch_extent_ptr *ptr;
1491         struct sort_iter sort_iter;
1492         struct nonce nonce;
1493         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1494         u64 seq = 0;
1495         bool used_mempool;
1496         unsigned long old, new;
1497         void *data;
1498
1499         /*
1500          * We may only have a read lock on the btree node - the dirty bit is our
1501          * "lock" against racing with other threads that may be trying to start
1502          * a write; we only do the write if we're the one to clear the dirty bit.
1503          * Since setting the dirty bit requires a write lock, we can't race with
1504          * other threads redirtying it:
1505          */
1506         do {
1507                 old = new = READ_ONCE(b->flags);
1508
1509                 if (!(old & (1 << BTREE_NODE_dirty)))
1510                         return;
1511
1512                 if (b->written &&
1513                     !btree_node_may_write(b))
1514                         return;
1515
1516                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1517                         btree_node_wait_on_io(b);
1518                         continue;
1519                 }
1520
1521                 new &= ~(1 << BTREE_NODE_dirty);
1522                 new &= ~(1 << BTREE_NODE_need_write);
1523                 new |=  (1 << BTREE_NODE_write_in_flight);
1524                 new |=  (1 << BTREE_NODE_just_written);
1525                 new ^=  (1 << BTREE_NODE_write_idx);
1526         } while (cmpxchg_acquire(&b->flags, old, new) != old);
1527
1528         BUG_ON(!list_empty(&b->write_blocked));
1529         BUG_ON((b->will_make_reachable != NULL) != !b->written);
1530
1531         BUG_ON(b->written >= c->sb.btree_node_size);
1532         BUG_ON(bset_written(b, btree_bset_last(b)));
1533         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1534         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1535
1536         if (lock_type_held == SIX_LOCK_intent) {
1537                 six_lock_write(&b->lock);
1538                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
1539                 six_unlock_write(&b->lock);
1540         } else {
1541                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1542         }
1543
1544         BUG_ON(b->uncompacted_whiteout_u64s);
1545
1546         sort_iter_init(&sort_iter, b);
1547
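        /*
         * Figure out how big a bounce buffer we need: the header (struct
         * btree_node for the node's first write, struct btree_node_entry for
         * subsequent appends), plus pending whiteouts, plus every bset that
         * hasn't been written yet. Unwritten bsets are also added to the sort
         * iterator so they can be merged into a single bset below.
         */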
1548         bytes = !b->written
1549                 ? sizeof(struct btree_node)
1550                 : sizeof(struct btree_node_entry);
1551
1552         bytes += b->whiteout_u64s * sizeof(u64);
1553
1554         for_each_bset(b, t) {
1555                 i = bset(b, t);
1556
1557                 if (bset_written(b, i))
1558                         continue;
1559
1560                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1561                 sort_iter_add(&sort_iter,
1562                               btree_bkey_first(b, t),
1563                               btree_bkey_last(b, t));
1564                 seq = max(seq, le64_to_cpu(i->journal_seq));
1565         }
1566
1567         order = get_order(bytes);
1568         data = btree_bounce_alloc(c, order, &used_mempool);
1569
1570         if (!b->written) {
1571                 bn = data;
1572                 *bn = *b->data;
1573                 i = &bn->keys;
1574         } else {
1575                 bne = data;
1576                 bne->keys = b->data->keys;
1577                 i = &bne->keys;
1578         }
1579
1580         i->journal_seq  = cpu_to_le64(seq);
1581         i->u64s         = 0;
1582
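        /*
         * Pending whiteouts: for non-extent btrees they're simply sorted in
         * with the rest of the keys; for extent btrees they're copied out
         * ahead of the keys and the bset is flagged BSET_SEPARATE_WHITEOUTS.
         */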
1583         if (!btree_node_is_extents(b)) {
1584                 sort_iter_add(&sort_iter,
1585                               unwritten_whiteouts_start(c, b),
1586                               unwritten_whiteouts_end(c, b));
1587                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1588         } else {
1589                 memcpy_u64s(i->start,
1590                             unwritten_whiteouts_start(c, b),
1591                             b->whiteout_u64s);
1592                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1593                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1594         }
1595
1596         b->whiteout_u64s = 0;
1597
1598         u64s = btree_node_is_extents(b)
1599                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1600                 : sort_keys(i->start, &sort_iter, false);
1601         le16_add_cpu(&i->u64s, u64s);
1602
1603         clear_needs_whiteout(i);
1604
1605         /* do we have data to write? */
1606         if (b->written && !i->u64s)
1607                 goto nowrite;
1608
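        /*
         * Writes are issued in whole filesystem blocks: round the size up and
         * zero the tail so we never write uninitialized bounce buffer contents
         * to disk.
         */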
1609         bytes_to_write = vstruct_end(i) - data;
1610         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1611
1612         memset(data + bytes_to_write, 0,
1613                (sectors_to_write << 9) - bytes_to_write);
1614
1615         BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
1616         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1617         BUG_ON(i->seq != b->data->keys.seq);
1618
1619         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1620         SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
1621
1622         nonce = btree_nonce(b, i, b->written << 9);
1623
1624         /* if we're going to be encrypting, check metadata validity first: */
1625         if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1626             validate_bset_for_write(c, b, i, sectors_to_write))
1627                 goto err;
1628
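        /*
         * First write of this node: the btree_node header fields between
         * ->flags and ->keys get encrypted too, so encrypt that range first,
         * advance the nonce past it (rounded up to the ChaCha20 block size),
         * then encrypt the bset; the checksum covers the whole vstruct.
         */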
1629         if (bn) {
1630                 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
1631                             &bn->flags,
1632                             (void *) &b->data->keys -
1633                             (void *) &b->data->flags);
1634                 nonce = nonce_add(nonce,
1635                                   round_up((void *) &b->data->keys -
1636                                            (void *) &b->data->flags,
1637                                            CHACHA20_BLOCK_SIZE));
1638                 bset_encrypt(c, i, nonce);
1639
1640                 nonce = btree_nonce(b, i, b->written << 9);
1641                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1642         } else {
1643                 bset_encrypt(c, i, nonce);
1644
1645                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1646         }
1647
1648         /* if we're not encrypting, check metadata after checksumming: */
1649         if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1650             validate_bset_for_write(c, b, i, sectors_to_write))
1651                 goto err;
1652
1653         /*
1654          * We handle btree write errors by immediately halting the journal -
1655          * after we've done that, we can't issue any subsequent btree writes
1656          * because they might have pointers to new nodes that failed to write.
1657          *
1658          * Furthermore, there's no point in doing any more btree writes because,
1659          * with the journal stopped, we'll never update the journal to reflect
1660          * that those writes completed and that their data was flushed from the
1661          * journal:
1662          *
1663          * Make sure to update b->written so bch2_btree_init_next() doesn't
1664          * break:
1665          */
1666         if (bch2_journal_error(&c->journal) ||
1667             c->opts.nochanges)
1668                 goto err;
1669
1670         trace_btree_write(b, bytes_to_write, sectors_to_write);
1671
1672         wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
1673         wbio->cl                = parent;
1674         wbio->order             = order;
1675         wbio->used_mempool      = used_mempool;
1676         wbio->data              = data;
1677         wbio->bio.bi_opf        = REQ_OP_WRITE|REQ_META|REQ_FUA;
1678         wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
1679         wbio->bio.bi_end_io     = btree_node_write_endio;
1680         wbio->bio.bi_private    = b;
1681
1682         if (parent)
1683                 closure_get(parent);
1684
1685         bch2_bio_map(&wbio->bio, data);
1686
1687         /*
1688          * If we're appending to a leaf node, we don't technically need FUA -
1689          * this write just needs to be persisted before the next journal write,
1690          * which will be marked FLUSH|FUA.
1691          *
1692          * Similarly if we're writing a new btree root - the pointer is going to
1693          * be in the next journal entry.
1694          *
1695          * But if we're writing a new btree node (that isn't a root) or
1696          * appending to a non-leaf btree node, we need either FUA or a flush
1697          * when we write the parent with the new pointer. FUA is cheaper than a
1698          * flush, and writes appending to leaf nodes aren't blocking anything so
1699          * just make all btree node writes FUA to keep things sane.
1700          */
1701
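        /*
         * Each write appends within the node on disk: take a copy of the
         * node's key and advance each pointer's offset by the sectors already
         * written, so this bio lands right after the previously written bsets.
         */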
1702         bkey_copy(&k.key, &b->key);
1703         e = bkey_i_to_s_extent(&k.key);
1704
1705         extent_for_each_ptr(e, ptr)
1706                 ptr->offset += b->written;
1707
1708         b->written += sectors_to_write;
1709
1710         bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
1711         return;
1712 err:
1713         set_btree_node_noevict(b);
1714         b->written += sectors_to_write;
1715 nowrite:
1716         btree_bounce_free(c, order, used_mempool, data);
1717         btree_node_write_done(c, b);
1718 }
1719
1720 /*
1721  * Work that must be done with write lock held:
1722  */
1723 bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
1724 {
1725         bool invalidated_iter = false;
1726         struct btree_node_entry *bne;
1727         struct bset_tree *t;
1728
1729         if (!btree_node_just_written(b))
1730                 return false;
1731
1732         BUG_ON(b->whiteout_u64s);
1733         BUG_ON(b->uncompacted_whiteout_u64s);
1734
1735         clear_btree_node_just_written(b);
1736
1737         /*
1738          * Note: immediately after write, bset_unwritten()/bset_written() don't
1739          * work - the amount of data we had to write after compaction might have
1740          * been smaller than the offset of the last bset.
1741          *
1742          * However, we know that all bsets have been written here, as long as
1743          * we're still holding the write lock:
1744          */
1745
1746         /*
1747          * XXX: decide if we really want to unconditionally sort down to a
1748          * single bset:
1749          */
1750         if (b->nsets > 1) {
1751                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1752                 invalidated_iter = true;
1753         } else {
1754                 invalidated_iter = bch2_drop_whiteouts(b);
1755         }
1756
1757         for_each_bset(b, t)
1758                 set_needs_whiteout(bset(b, t));
1759
1760         bch2_btree_verify(c, b);
1761
1762         /*
1763          * If later we don't unconditionally sort down to a single bset, we have
1764          * to ensure this is still true:
1765          */
1766         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1767
1768         bne = want_new_bset(c, b);
1769         if (bne)
1770                 bch2_bset_init_next(b, &bne->keys);
1771
1772         bch2_btree_build_aux_trees(b);
1773
1774         return invalidated_iter;
1775 }
1776
1777 /*
1778  * Use this one if the node is read or intent locked (never write locked):
1779  */
1780 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1781                           struct closure *parent,
1782                           enum six_lock_type lock_type_held)
1783 {
1784         BUG_ON(lock_type_held == SIX_LOCK_write);
1785
1786         if (lock_type_held == SIX_LOCK_intent ||
1787             six_trylock_convert(&b->lock, SIX_LOCK_read,
1788                                 SIX_LOCK_intent)) {
1789                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
1790
1791                 /* don't cycle lock unnecessarily: */
1792                 if (btree_node_just_written(b)) {
1793                         six_lock_write(&b->lock);
1794                         bch2_btree_post_write_cleanup(c, b);
1795                         six_unlock_write(&b->lock);
1796                 }
1797
1798                 if (lock_type_held == SIX_LOCK_read)
1799                         six_lock_downgrade(&b->lock);
1800         } else {
1801                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
1802         }
1803 }
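
/*
 * Typical caller pattern - a sketch only, assuming the six_lock_read()/
 * six_unlock_read() helpers from six.h:
 *
 *	six_lock_read(&b->lock);
 *	bch2_btree_node_write(c, b, NULL, SIX_LOCK_read);
 *	six_unlock_read(&b->lock);
 *
 * The write completes asynchronously; pass a closure as @parent to be able to
 * wait on it.
 */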
1804
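/*
 * Debug check: walk the btree node cache hash table under RCU and assert that
 * no node is still marked dirty, i.e. that a write has been started for
 * everything that needed one.
 */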
1805 void bch2_btree_verify_flushed(struct bch_fs *c)
1806 {
1807         struct bucket_table *tbl;
1808         struct rhash_head *pos;
1809         struct btree *b;
1810         unsigned i;
1811
1812         rcu_read_lock();
1813         tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
1814                                   &c->btree_cache_table);
1815
1816         for (i = 0; i < tbl->size; i++)
1817                 rht_for_each_entry_rcu(b, pos, tbl, i, hash)
1818                         BUG_ON(btree_node_dirty(b));
1819         rcu_read_unlock();
1820 }