libbcachefs/btree_io.c
1
2 #include "bcachefs.h"
3 #include "bkey_methods.h"
4 #include "btree_cache.h"
5 #include "btree_io.h"
6 #include "btree_iter.h"
7 #include "btree_locking.h"
8 #include "btree_update.h"
9 #include "btree_update_interior.h"
10 #include "buckets.h"
11 #include "checksum.h"
12 #include "debug.h"
13 #include "error.h"
14 #include "extents.h"
15 #include "io.h"
16 #include "journal.h"
17 #include "super-io.h"
18
19 #include <trace/events/bcachefs.h>
20
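/*
 * Debug helper: walk [start, end) and assert successive keys are in order -
 * strictly increasing for non-extent nodes; for extent nodes each key must
 * end at or before the start of the next. Compiled out unless
 * CONFIG_BCACHEFS_DEBUG is set.
 */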
21 static void verify_no_dups(struct btree *b,
22                            struct bkey_packed *start,
23                            struct bkey_packed *end)
24 {
25 #ifdef CONFIG_BCACHEFS_DEBUG
26         struct bkey_packed *k;
27
28         for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
29                 struct bkey l = bkey_unpack_key(b, k);
30                 struct bkey r = bkey_unpack_key(b, bkey_next(k));
31
32                 BUG_ON(btree_node_is_extents(b)
33                        ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
34                        : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
35                 //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
36         }
37 #endif
38 }
39
40 static void clear_needs_whiteout(struct bset *i)
41 {
42         struct bkey_packed *k;
43
44         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
45                 k->needs_whiteout = false;
46 }
47
48 static void set_needs_whiteout(struct bset *i)
49 {
50         struct bkey_packed *k;
51
52         for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
53                 k->needs_whiteout = true;
54 }
55
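/*
 * Bounce buffers for sorting/compacting: try an opportunistic page allocation
 * first (GFP_NOWAIT, no warning on failure), falling back to the preallocated
 * btree_bounce_pool mempool so forward progress is always possible.
 */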
56 static void btree_bounce_free(struct bch_fs *c, unsigned order,
57                               bool used_mempool, void *p)
58 {
59         if (used_mempool)
60                 mempool_free(p, &c->btree_bounce_pool);
61         else
62                 vpfree(p, PAGE_SIZE << order);
63 }
64
65 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
66                                 bool *used_mempool)
67 {
68         void *p;
69
70         BUG_ON(order > btree_page_order(c));
71
72         *used_mempool = false;
73         p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
74         if (p)
75                 return p;
76
77         *used_mempool = true;
78         return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
79 }
80
81 typedef int (*sort_cmp_fn)(struct btree *,
82                            struct bkey_packed *,
83                            struct bkey_packed *);
84
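/*
 * sort_iter: a small merge iterator over up to MAX_BSETS + 1 ranges of packed
 * keys within one btree node. sort_iter_sort() orders the ranges by their
 * current first key; sort_iter_next() then pops the smallest key and re-sifts,
 * yielding a single merged, sorted stream.
 *
 * Usage sketch (mirroring btree_node_sort() below):
 *
 *	struct sort_iter iter;
 *	struct bset_tree *t;
 *
 *	sort_iter_init(&iter, b);
 *	for_each_bset(b, t)
 *		sort_iter_add(&iter,
 *			      btree_bkey_first(b, t),
 *			      btree_bkey_last(b, t));
 *	u64s = sort_keys(out->keys.start, &iter, filter_whiteouts);
 */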
85 struct sort_iter {
86         struct btree    *b;
87         unsigned                used;
88
89         struct sort_iter_set {
90                 struct bkey_packed *k, *end;
91         } data[MAX_BSETS + 1];
92 };
93
94 static void sort_iter_init(struct sort_iter *iter, struct btree *b)
95 {
96         memset(iter, 0, sizeof(*iter));
97         iter->b = b;
98 }
99
100 static inline void __sort_iter_sift(struct sort_iter *iter,
101                                     unsigned from,
102                                     sort_cmp_fn cmp)
103 {
104         unsigned i;
105
106         for (i = from;
107              i + 1 < iter->used &&
108              cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
109              i++)
110                 swap(iter->data[i], iter->data[i + 1]);
111 }
112
113 static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp)
114 {
115
116         __sort_iter_sift(iter, 0, cmp);
117 }
118
119 static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
120 {
121         unsigned i = iter->used;
122
123         while (i--)
124                 __sort_iter_sift(iter, i, cmp);
125 }
126
127 static void sort_iter_add(struct sort_iter *iter,
128                           struct bkey_packed *k,
129                           struct bkey_packed *end)
130 {
131         BUG_ON(iter->used >= ARRAY_SIZE(iter->data));
132
133         if (k != end)
134                 iter->data[iter->used++] = (struct sort_iter_set) { k, end };
135 }
136
137 static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
138 {
139         return iter->used ? iter->data->k : NULL;
140 }
141
142 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
143 {
144         iter->data->k = bkey_next(iter->data->k);
145
146         BUG_ON(iter->data->k > iter->data->end);
147
148         if (iter->data->k == iter->data->end)
149                 array_remove_item(iter->data, iter->used, 0);
150         else
151                 sort_iter_sift(iter, cmp);
152 }
153
154 static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
155                                                  sort_cmp_fn cmp)
156 {
157         struct bkey_packed *ret = sort_iter_peek(iter);
158
159         if (ret)
160                 sort_iter_advance(iter, cmp);
161
162         return ret;
163 }
164
165 static inline int sort_key_whiteouts_cmp(struct btree *b,
166                                          struct bkey_packed *l,
167                                          struct bkey_packed *r)
168 {
169         return bkey_cmp_packed(b, l, r);
170 }
171
172 static unsigned sort_key_whiteouts(struct bkey_packed *dst,
173                                    struct sort_iter *iter)
174 {
175         struct bkey_packed *in, *out = dst;
176
177         sort_iter_sort(iter, sort_key_whiteouts_cmp);
178
179         while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) {
180                 bkey_copy(out, in);
181                 out = bkey_next(out);
182         }
183
184         return (u64 *) out - (u64 *) dst;
185 }
186
187 static inline int sort_extent_whiteouts_cmp(struct btree *b,
188                                             struct bkey_packed *l,
189                                             struct bkey_packed *r)
190 {
191         struct bkey ul = bkey_unpack_key(b, l);
192         struct bkey ur = bkey_unpack_key(b, r);
193
194         return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur));
195 }
196
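/*
 * Extent whiteouts are ranges: after sorting by start position, adjacent or
 * overlapping whiteouts are coalesced by extending the previous whiteout (up
 * to what the key format can encode) and trimming the front of the next one,
 * so the output covers the same ranges with fewer keys.
 */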
197 static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
198                                       struct sort_iter *iter)
199 {
200         const struct bkey_format *f = &iter->b->format;
201         struct bkey_packed *in, *out = dst;
202         struct bkey_i l, r;
203         bool prev = false, l_packed = false;
204         u64 max_packed_size     = bkey_field_max(f, BKEY_FIELD_SIZE);
205         u64 max_packed_offset   = bkey_field_max(f, BKEY_FIELD_OFFSET);
206         u64 new_size;
207
208         max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX);
209
210         sort_iter_sort(iter, sort_extent_whiteouts_cmp);
211
212         while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
213                 EBUG_ON(bkeyp_val_u64s(f, in));
214                 EBUG_ON(in->type != KEY_TYPE_DISCARD);
215
216                 r.k = bkey_unpack_key(iter->b, in);
217
218                 if (prev &&
219                     bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) {
220                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
221                                 continue;
222
223                         new_size = l_packed
224                                 ? min(max_packed_size, max_packed_offset -
225                                       bkey_start_offset(&l.k))
226                                 : KEY_SIZE_MAX;
227
228                         new_size = min(new_size, r.k.p.offset -
229                                        bkey_start_offset(&l.k));
230
231                         BUG_ON(new_size < l.k.size);
232
233                         bch2_key_resize(&l.k, new_size);
234
235                         if (bkey_cmp(l.k.p, r.k.p) >= 0)
236                                 continue;
237
238                         bch2_cut_front(l.k.p, &r);
239                 }
240
241                 if (prev) {
242                         if (!bch2_bkey_pack(out, &l, f)) {
243                                 BUG_ON(l_packed);
244                                 bkey_copy(out, &l);
245                         }
246                         out = bkey_next(out);
247                 }
248
249                 l = r;
250                 prev = true;
251                 l_packed = bkey_packed(in);
252         }
253
254         if (prev) {
255                 if (!bch2_bkey_pack(out, &l, f)) {
256                         BUG_ON(l_packed);
257                         bkey_copy(out, &l);
258                 }
259                 out = bkey_next(out);
260         }
261
262         return (u64 *) out - (u64 *) dst;
263 }
264
265 static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
266                                     bool compacting,
267                                     enum compact_mode mode)
268 {
269         unsigned live_u64s = b->nr.bset_u64s[t - b->set];
270         unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
271
272         if (live_u64s == bset_u64s)
273                 return 0;
274
275         if (mode == COMPACT_LAZY) {
276                 if (live_u64s * 4 < bset_u64s * 3 ||
277                     (compacting && bset_unwritten(b, bset(b, t))))
278                         return bset_u64s - live_u64s;
279         } else {
280                 if (bset_written(b, bset(b, t)))
281                         return bset_u64s - live_u64s;
282         }
283
284         return 0;
285 }
286
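/*
 * Compact the node's bsets: whiteouts that no longer need to be written are
 * dropped, and whiteouts that still do (k->needs_whiteout) are moved out of
 * the bsets and into the node's unwritten whiteout area (b->whiteout_u64s),
 * where they are sorted and, for extent nodes, merged. In
 * COMPACT_WRITTEN_NO_WRITE_LOCK mode the bset contents are left untouched and
 * only the whiteouts are gathered.
 */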
287 bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
288                              enum compact_mode mode)
289 {
290         const struct bkey_format *f = &b->format;
291         struct bset_tree *t;
292         struct bkey_packed *whiteouts = NULL;
293         struct bkey_packed *u_start, *u_pos;
294         struct sort_iter sort_iter;
295         unsigned order, whiteout_u64s = 0, u64s;
296         bool used_mempool, compacting = false;
297
298         for_each_bset(b, t)
299                 whiteout_u64s += should_compact_bset(b, t,
300                                         whiteout_u64s != 0, mode);
301
302         if (!whiteout_u64s)
303                 return false;
304
305         sort_iter_init(&sort_iter, b);
306
307         whiteout_u64s += b->whiteout_u64s;
308         order = get_order(whiteout_u64s * sizeof(u64));
309
310         whiteouts = btree_bounce_alloc(c, order, &used_mempool);
311         u_start = u_pos = whiteouts;
312
313         memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
314                     b->whiteout_u64s);
315         u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64);
316
317         sort_iter_add(&sort_iter, u_start, u_pos);
318
319         for_each_bset(b, t) {
320                 struct bset *i = bset(b, t);
321                 struct bkey_packed *k, *n, *out, *start, *end;
322                 struct btree_node_entry *src = NULL, *dst = NULL;
323
324                 if (t != b->set && bset_unwritten(b, i)) {
325                         src = container_of(i, struct btree_node_entry, keys);
326                         dst = max(write_block(b),
327                                   (void *) btree_bkey_last(b, t - 1));
328                 }
329
330                 if (!should_compact_bset(b, t, compacting, mode)) {
331                         if (src != dst) {
332                                 memmove(dst, src, sizeof(*src) +
333                                         le16_to_cpu(src->keys.u64s) *
334                                         sizeof(u64));
335                                 i = &dst->keys;
336                                 set_btree_bset(b, t, i);
337                         }
338                         continue;
339                 }
340
341                 compacting = true;
342                 u_start = u_pos;
343                 start = i->start;
344                 end = vstruct_last(i);
345
346                 if (src != dst) {
347                         memmove(dst, src, sizeof(*src));
348                         i = &dst->keys;
349                         set_btree_bset(b, t, i);
350                 }
351
352                 out = i->start;
353
354                 for (k = start; k != end; k = n) {
355                         n = bkey_next(k);
356
357                         if (bkey_deleted(k) && btree_node_is_extents(b))
358                                 continue;
359
360                         if (bkey_whiteout(k) && !k->needs_whiteout)
361                                 continue;
362
363                         if (bkey_whiteout(k)) {
364                                 unreserve_whiteout(b, t, k);
365                                 memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
366                                 set_bkeyp_val_u64s(f, u_pos, 0);
367                                 u_pos = bkey_next(u_pos);
368                         } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
369                                 bkey_copy(out, k);
370                                 out = bkey_next(out);
371                         }
372                 }
373
374                 sort_iter_add(&sort_iter, u_start, u_pos);
375
376                 if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) {
377                         i->u64s = cpu_to_le16((u64 *) out - i->_data);
378                         set_btree_bset_end(b, t);
379                         bch2_bset_set_no_aux_tree(b, t);
380                 }
381         }
382
383         b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts;
384
385         BUG_ON((void *) unwritten_whiteouts_start(c, b) <
386                (void *) btree_bkey_last(b, bset_tree_last(b)));
387
388         u64s = btree_node_is_extents(b)
389                 ? sort_extent_whiteouts(unwritten_whiteouts_start(c, b),
390                                         &sort_iter)
391                 : sort_key_whiteouts(unwritten_whiteouts_start(c, b),
392                                      &sort_iter);
393
394         BUG_ON(u64s > b->whiteout_u64s);
395         BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b));
396         BUG_ON(u_pos != whiteouts && !u64s);
397
398         if (u64s != b->whiteout_u64s) {
399                 void *src = unwritten_whiteouts_start(c, b);
400
401                 b->whiteout_u64s = u64s;
402                 memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s);
403         }
404
405         verify_no_dups(b,
406                        unwritten_whiteouts_start(c, b),
407                        unwritten_whiteouts_end(c, b));
408
409         btree_bounce_free(c, order, used_mempool, whiteouts);
410
411         if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK)
412                 bch2_btree_build_aux_trees(b);
413
414         bch_btree_keys_u64s_remaining(c, b);
415         bch2_verify_btree_nr_keys(b);
416
417         return true;
418 }
419
420 static bool bch2_drop_whiteouts(struct btree *b)
421 {
422         struct bset_tree *t;
423         bool ret = false;
424
425         for_each_bset(b, t) {
426                 struct bset *i = bset(b, t);
427                 struct bkey_packed *k, *n, *out, *start, *end;
428
429                 if (!should_compact_bset(b, t, true, true))
430                         continue;
431
432                 start   = btree_bkey_first(b, t);
433                 end     = btree_bkey_last(b, t);
434
435                 if (bset_unwritten(b, i) &&
436                     t != b->set) {
437                         struct bset *dst =
438                                max_t(struct bset *, write_block(b),
439                                      (void *) btree_bkey_last(b, t - 1));
440
441                         memmove(dst, i, sizeof(struct bset));
442                         i = dst;
443                         set_btree_bset(b, t, i);
444                 }
445
446                 out = i->start;
447
448                 for (k = start; k != end; k = n) {
449                         n = bkey_next(k);
450
451                         if (!bkey_whiteout(k)) {
452                                 bkey_copy(out, k);
453                                 out = bkey_next(out);
454                         }
455                 }
456
457                 i->u64s = cpu_to_le16((u64 *) out - i->_data);
458                 bch2_bset_set_no_aux_tree(b, t);
459                 ret = true;
460         }
461
462         bch2_verify_btree_nr_keys(b);
463
464         return ret;
465 }
466
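/*
 * Ordering for the main key sort: keys at the same position are ordered with
 * whiteouts first, so that sort_keys() sees a whiteout before the key it
 * shadows and can drop it, propagating needs_whiteout to the surviving key.
 */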
467 static inline int sort_keys_cmp(struct btree *b,
468                                 struct bkey_packed *l,
469                                 struct bkey_packed *r)
470 {
471         return bkey_cmp_packed(b, l, r) ?:
472                 (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?:
473                 (int) l->needs_whiteout - (int) r->needs_whiteout;
474 }
475
476 static unsigned sort_keys(struct bkey_packed *dst,
477                           struct sort_iter *iter,
478                           bool filter_whiteouts)
479 {
480         const struct bkey_format *f = &iter->b->format;
481         struct bkey_packed *in, *next, *out = dst;
482
483         sort_iter_sort(iter, sort_keys_cmp);
484
485         while ((in = sort_iter_next(iter, sort_keys_cmp))) {
486                 if (bkey_whiteout(in) &&
487                     (filter_whiteouts || !in->needs_whiteout))
488                         continue;
489
490                 if (bkey_whiteout(in) &&
491                     (next = sort_iter_peek(iter)) &&
492                     !bkey_cmp_packed(iter->b, in, next)) {
493                         BUG_ON(in->needs_whiteout &&
494                                next->needs_whiteout);
495                         /*
496                          * XXX racy, called with read lock from write path
497                          *
498                          * leads to spurious BUG_ON() in bkey_unpack_key() in
499                          * debug mode
500                          */
501                         next->needs_whiteout |= in->needs_whiteout;
502                         continue;
503                 }
504
505                 if (bkey_whiteout(in)) {
506                         memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
507                         set_bkeyp_val_u64s(f, out, 0);
508                 } else {
509                         bkey_copy(out, in);
510                 }
511                 out = bkey_next(out);
512         }
513
514         return (u64 *) out - (u64 *) dst;
515 }
516
517 static inline int sort_extents_cmp(struct btree *b,
518                                    struct bkey_packed *l,
519                                    struct bkey_packed *r)
520 {
521         return bkey_cmp_packed(b, l, r) ?:
522                 (int) bkey_deleted(l) - (int) bkey_deleted(r);
523 }
524
525 static unsigned sort_extents(struct bkey_packed *dst,
526                              struct sort_iter *iter,
527                              bool filter_whiteouts)
528 {
529         struct bkey_packed *in, *out = dst;
530
531         sort_iter_sort(iter, sort_extents_cmp);
532
533         while ((in = sort_iter_next(iter, sort_extents_cmp))) {
534                 if (bkey_deleted(in))
535                         continue;
536
537                 if (bkey_whiteout(in) &&
538                     (filter_whiteouts || !in->needs_whiteout))
539                         continue;
540
541                 bkey_copy(out, in);
542                 out = bkey_next(out);
543         }
544
545         return (u64 *) out - (u64 *) dst;
546 }
547
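/*
 * Merge-sort bsets [start_idx, end_idx) into a bounce buffer. If the whole
 * node is being sorted, the bounce buffer is node-sized and is simply swapped
 * with b->data; otherwise the sorted keys are copied back into the first bset
 * of the range. bset_u64s accounting and b->nsets are updated to match.
 */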
548 static void btree_node_sort(struct bch_fs *c, struct btree *b,
549                             struct btree_iter *iter,
550                             unsigned start_idx,
551                             unsigned end_idx,
552                             bool filter_whiteouts)
553 {
554         struct btree_node *out;
555         struct sort_iter sort_iter;
556         struct bset_tree *t;
557         struct bset *start_bset = bset(b, &b->set[start_idx]);
558         bool used_mempool = false;
559         u64 start_time, seq = 0;
560         unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
561         bool sorting_entire_node = start_idx == 0 &&
562                 end_idx == b->nsets;
563
564         sort_iter_init(&sort_iter, b);
565
566         for (t = b->set + start_idx;
567              t < b->set + end_idx;
568              t++) {
569                 u64s += le16_to_cpu(bset(b, t)->u64s);
570                 sort_iter_add(&sort_iter,
571                               btree_bkey_first(b, t),
572                               btree_bkey_last(b, t));
573         }
574
575         order = sorting_entire_node
576                 ? btree_page_order(c)
577                 : get_order(__vstruct_bytes(struct btree_node, u64s));
578
579         out = btree_bounce_alloc(c, order, &used_mempool);
580
581         start_time = local_clock();
582
583         if (btree_node_is_extents(b))
584                 filter_whiteouts = bset_written(b, start_bset);
585
586         u64s = btree_node_is_extents(b)
587                 ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts)
588                 : sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
589
590         out->keys.u64s = cpu_to_le16(u64s);
591
592         BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
593
594         if (sorting_entire_node)
595                 bch2_time_stats_update(&c->btree_sort_time, start_time);
596
597         /* Make sure we preserve bset journal_seq: */
598         for (t = b->set + start_idx; t < b->set + end_idx; t++)
599                 seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
600         start_bset->journal_seq = cpu_to_le64(seq);
601
602         if (sorting_entire_node) {
603                 unsigned u64s = le16_to_cpu(out->keys.u64s);
604
605                 BUG_ON(order != btree_page_order(c));
606
607                 /*
608                  * Our temporary buffer is the same size as the btree node's
609                  * buffer, we can just swap buffers instead of doing a big
610                  * memcpy()
611                  */
612                 *out = *b->data;
613                 out->keys.u64s = cpu_to_le16(u64s);
614                 swap(out, b->data);
615                 set_btree_bset(b, b->set, &b->data->keys);
616         } else {
617                 start_bset->u64s = out->keys.u64s;
618                 memcpy_u64s(start_bset->start,
619                             out->keys.start,
620                             le16_to_cpu(out->keys.u64s));
621         }
622
623         for (i = start_idx + 1; i < end_idx; i++)
624                 b->nr.bset_u64s[start_idx] +=
625                         b->nr.bset_u64s[i];
626
627         b->nsets -= shift;
628
629         for (i = start_idx + 1; i < b->nsets; i++) {
630                 b->nr.bset_u64s[i]      = b->nr.bset_u64s[i + shift];
631                 b->set[i]               = b->set[i + shift];
632         }
633
634         for (i = b->nsets; i < MAX_BSETS; i++)
635                 b->nr.bset_u64s[i] = 0;
636
637         set_btree_bset_end(b, &b->set[start_idx]);
638         bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
639
640         btree_bounce_free(c, order, used_mempool, out);
641
642         bch2_verify_btree_nr_keys(b);
643 }
644
645 /* Sort + repack in a new format: */
646 static struct btree_nr_keys sort_repack(struct bset *dst,
647                                         struct btree *src,
648                                         struct btree_node_iter *src_iter,
649                                         struct bkey_format *out_f,
650                                         bool filter_whiteouts)
651 {
652         struct bkey_format *in_f = &src->format;
653         struct bkey_packed *in, *out = vstruct_last(dst);
654         struct btree_nr_keys nr;
655
656         memset(&nr, 0, sizeof(nr));
657
658         while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
659                 if (filter_whiteouts && bkey_whiteout(in))
660                         continue;
661
662                 if (bch2_bkey_transform(out_f, out, bkey_packed(in)
663                                        ? in_f : &bch2_bkey_format_current, in))
664                         out->format = KEY_FORMAT_LOCAL_BTREE;
665                 else
666                         bch2_bkey_unpack(src, (void *) out, in);
667
668                 btree_keys_account_key_add(&nr, 0, out);
669                 out = bkey_next(out);
670         }
671
672         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
673         return nr;
674 }
675
676 /* Sort, repack, and merge: */
677 static struct btree_nr_keys sort_repack_merge(struct bch_fs *c,
678                                               struct bset *dst,
679                                               struct btree *src,
680                                               struct btree_node_iter *iter,
681                                               struct bkey_format *out_f,
682                                               bool filter_whiteouts,
683                                               key_filter_fn filter,
684                                               key_merge_fn merge)
685 {
686         struct bkey_packed *k, *prev = NULL, *out;
687         struct btree_nr_keys nr;
688         BKEY_PADDED(k) tmp;
689
690         memset(&nr, 0, sizeof(nr));
691
692         while ((k = bch2_btree_node_iter_next_all(iter, src))) {
693                 if (filter_whiteouts && bkey_whiteout(k))
694                         continue;
695
696                 /*
697                  * The filter might modify pointers, so we have to unpack the
698                  * key and values to &tmp.k:
699                  */
700                 bch2_bkey_unpack(src, &tmp.k, k);
701
702                 if (filter && filter(c, src, bkey_i_to_s(&tmp.k)))
703                         continue;
704
705                 /* prev is always unpacked, for key merging: */
706
707                 if (prev &&
708                     merge &&
709                     merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE)
710                         continue;
711
712                 /*
713                  * the current key becomes the new prev: advance prev, then
714                  * copy the current key - but first pack prev (in place):
715                  */
716                 if (prev) {
717                         bch2_bkey_pack(prev, (void *) prev, out_f);
718
719                         btree_keys_account_key_add(&nr, 0, prev);
720                         prev = bkey_next(prev);
721                 } else {
722                         prev = vstruct_last(dst);
723                 }
724
725                 bkey_copy(prev, &tmp.k);
726         }
727
728         if (prev) {
729                 bch2_bkey_pack(prev, (void *) prev, out_f);
730                 btree_keys_account_key_add(&nr, 0, prev);
731                 out = bkey_next(prev);
732         } else {
733                 out = vstruct_last(dst);
734         }
735
736         dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
737         return nr;
738 }
739
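/*
 * Sort all of @src's keys into @dst's first (and only) bset, repacking them
 * into @dst's key format; if the btree type provides key_normalize/key_merge
 * ops, keys are also normalized and merged along the way.
 */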
740 void bch2_btree_sort_into(struct bch_fs *c,
741                          struct btree *dst,
742                          struct btree *src)
743 {
744         struct btree_nr_keys nr;
745         struct btree_node_iter src_iter;
746         u64 start_time = local_clock();
747
748         BUG_ON(dst->nsets != 1);
749
750         bch2_bset_set_no_aux_tree(dst, dst->set);
751
752         bch2_btree_node_iter_init_from_start(&src_iter, src,
753                                             btree_node_is_extents(src));
754
755         if (btree_node_ops(src)->key_normalize ||
756             btree_node_ops(src)->key_merge)
757                 nr = sort_repack_merge(c, btree_bset_first(dst),
758                                 src, &src_iter,
759                                 &dst->format,
760                                 true,
761                                 btree_node_ops(src)->key_normalize,
762                                 btree_node_ops(src)->key_merge);
763         else
764                 nr = sort_repack(btree_bset_first(dst),
765                                 src, &src_iter,
766                                 &dst->format,
767                                 true);
768
769         bch2_time_stats_update(&c->btree_sort_time, start_time);
770
771         set_btree_bset_end(dst, dst->set);
772
773         dst->nr.live_u64s       += nr.live_u64s;
774         dst->nr.bset_u64s[0]    += nr.bset_u64s[0];
775         dst->nr.packed_keys     += nr.packed_keys;
776         dst->nr.unpacked_keys   += nr.unpacked_keys;
777
778         bch2_verify_btree_nr_keys(dst);
779 }
780
781 #define SORT_CRIT       (4096 / sizeof(u64))
782
783 /*
784  * We're about to add another bset to the btree node, so if there are
785  * currently too many bsets, sort some of them together:
786  */
787 static bool btree_node_compact(struct bch_fs *c, struct btree *b,
788                                struct btree_iter *iter)
789 {
790         unsigned unwritten_idx;
791         bool ret = false;
792
793         for (unwritten_idx = 0;
794              unwritten_idx < b->nsets;
795              unwritten_idx++)
796                 if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
797                         break;
798
799         if (b->nsets - unwritten_idx > 1) {
800                 btree_node_sort(c, b, iter, unwritten_idx,
801                                 b->nsets, false);
802                 ret = true;
803         }
804
805         if (unwritten_idx > 1) {
806                 btree_node_sort(c, b, iter, 0, unwritten_idx, false);
807                 ret = true;
808         }
809
810         return ret;
811 }
812
813 void bch2_btree_build_aux_trees(struct btree *b)
814 {
815         struct bset_tree *t;
816
817         for_each_bset(b, t)
818                 bch2_bset_build_aux_tree(b, t,
819                                 bset_unwritten(b, bset(b, t)) &&
820                                 t == bset_tree_last(b));
821 }
822
823 /*
824  * bch2_btree_init_next - initialize a new (unwritten) bset that can then be
825  * inserted into
826  *
827  * Safe to call if there already is an unwritten bset - will only add a new bset
828  * if @b doesn't already have one.
829  *
830  * If a sort was done (invalidating iterators), @iter is reinitialized.
831  */
832 void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
833                          struct btree_iter *iter)
834 {
835         struct btree_node_entry *bne;
836         bool did_sort;
837
838         EBUG_ON(!(b->lock.state.seq & 1));
839         EBUG_ON(iter && iter->nodes[b->level] != b);
840
841         did_sort = btree_node_compact(c, b, iter);
842
843         bne = want_new_bset(c, b);
844         if (bne)
845                 bch2_bset_init_next(b, &bne->keys);
846
847         bch2_btree_build_aux_trees(b);
848
849         if (iter && did_sort)
850                 bch2_btree_iter_reinit_node(iter, b);
851 }
852
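/*
 * Nonce for bset checksums/encryption: derived from the bset's byte offset
 * within the node, the bset seq, and the low word of journal_seq xored with
 * BCH_NONCE_BTREE (presumably so btree nonces can't collide with other nonce
 * types).
 */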
853 static struct nonce btree_nonce(struct bset *i, unsigned offset)
854 {
855         return (struct nonce) {{
856                 [0] = cpu_to_le32(offset),
857                 [1] = ((__le32 *) &i->seq)[0],
858                 [2] = ((__le32 *) &i->seq)[1],
859                 [3] = ((__le32 *) &i->journal_seq)[0] ^ BCH_NONCE_BTREE,
860         }};
861 }
862
863 static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
864 {
865         struct nonce nonce = btree_nonce(i, offset);
866
867         if (!offset) {
868                 struct btree_node *bn = container_of(i, struct btree_node, keys);
869                 unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
870
871                 bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
872                              bytes);
873
874                 nonce = nonce_add(nonce, round_up(bytes, CHACHA20_BLOCK_SIZE));
875         }
876
877         bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
878                      vstruct_end(i) - (void *) i->_data);
879 }
880
881 static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
882                          unsigned offset, int write, char *buf, size_t len)
883 {
884         char *out = buf, *end = buf + len;
885
886         out += scnprintf(out, end - out,
887                          "error validating btree node %s "
888                          "at btree %u level %u/%u\n"
889                          "pos %llu:%llu node offset %u",
890                          write ? "before write " : "",
891                          b->btree_id, b->level,
892                          c->btree_roots[b->btree_id].level,
893                          b->key.k.p.inode, b->key.k.p.offset,
894                          b->written);
895         if (i)
896                 out += scnprintf(out, end - out,
897                                  " bset u64s %u",
898                                  le16_to_cpu(i->u64s));
899
900         return out - buf;
901 }
902
903 enum btree_err_type {
904         BTREE_ERR_FIXABLE,
905         BTREE_ERR_WANT_RETRY,
906         BTREE_ERR_MUST_RETRY,
907         BTREE_ERR_FATAL,
908 };
909
910 enum btree_validate_ret {
911         BTREE_RETRY_READ = 64,
912 };
913
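/*
 * btree_err()/btree_err_on(): report a validation problem and decide how to
 * proceed based on the error type and whether we're reading or writing.
 * Fixable errors hit at read time before initial GC has finished are repaired
 * via fsck; retryable errors set BTREE_RETRY_READ when another replica is
 * available; everything else fails with BCH_FSCK_ERRORS_NOT_FIXED.
 */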
914 #define btree_err(type, c, b, i, msg, ...)                              \
915 ({                                                                      \
916         char buf[200], *out = buf, *end = out + sizeof(buf);            \
917                                                                         \
918         out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
919         out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__);      \
920                                                                         \
921         if (type == BTREE_ERR_FIXABLE &&                                \
922             write == READ &&                                            \
923             !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
924                 mustfix_fsck_err(c, "%s", buf);                         \
925         } else {                                                        \
926                 bch_err(c, "%s", buf);                                  \
927                                                                         \
928                 switch (type) {                                         \
929                 case BTREE_ERR_FIXABLE:                                 \
930                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
931                         goto fsck_err;                                  \
932                 case BTREE_ERR_WANT_RETRY:                              \
933                         if (have_retry) {                               \
934                                 ret = BTREE_RETRY_READ;                 \
935                                 goto fsck_err;                          \
936                         }                                               \
937                         break;                                          \
938                 case BTREE_ERR_MUST_RETRY:                              \
939                         ret = BTREE_RETRY_READ;                         \
940                         goto fsck_err;                                  \
941                 case BTREE_ERR_FATAL:                                   \
942                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
943                         goto fsck_err;                                  \
944                 }                                                       \
945         }                                                               \
946         true;                                                           \
947 })
948
949 #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
950
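/*
 * Validate one bset read from disk: for the first bset, check the node header
 * fields (btree id, level, max key, key format) and byteswap if needed; then
 * walk the keys, dropping any that are malformed or fail bkey validation.
 * Also records, via *whiteout_u64s, where the leading whiteouts end for
 * BSET_SEPARATE_WHITEOUTS bsets.
 */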
951 static int validate_bset(struct bch_fs *c, struct btree *b,
952                          struct bset *i, unsigned sectors,
953                          unsigned *whiteout_u64s, int write,
954                          bool have_retry)
955 {
956         struct bkey_packed *k, *prev = NULL;
957         struct bpos prev_pos = POS_MIN;
958         enum bkey_type type = btree_node_type(b);
959         bool seen_non_whiteout = false;
960         const char *err;
961         int ret = 0;
962
963         if (i == &b->data->keys) {
964                 /* These indicate that we read the wrong btree node: */
965                 btree_err_on(BTREE_NODE_ID(b->data) != b->btree_id,
966                              BTREE_ERR_MUST_RETRY, c, b, i,
967                              "incorrect btree id");
968
969                 btree_err_on(BTREE_NODE_LEVEL(b->data) != b->level,
970                              BTREE_ERR_MUST_RETRY, c, b, i,
971                              "incorrect level");
972
973                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) {
974                         u64 *p = (u64 *) &b->data->ptr;
975
976                         *p = swab64(*p);
977                         bch2_bpos_swab(&b->data->min_key);
978                         bch2_bpos_swab(&b->data->max_key);
979                 }
980
981                 btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
982                              BTREE_ERR_MUST_RETRY, c, b, i,
983                              "incorrect max key");
984
985                 /* XXX: ideally we would be validating min_key too */
986 #if 0
987                 /*
988                  * not correct anymore, due to btree node write error
989                  * handling
990                  *
991                  * need to add b->data->seq to btree keys and verify
992                  * against that
993                  */
994                 btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
995                                                   b->data->ptr),
996                              BTREE_ERR_FATAL, c, b, i,
997                              "incorrect backpointer");
998 #endif
999                 err = bch2_bkey_format_validate(&b->data->format);
1000                 btree_err_on(err,
1001                              BTREE_ERR_FATAL, c, b, i,
1002                              "invalid bkey format: %s", err);
1003         }
1004
1005         if (btree_err_on(le16_to_cpu(i->version) != BCACHE_BSET_VERSION,
1006                          BTREE_ERR_FIXABLE, c, b, i,
1007                          "unsupported bset version")) {
1008                 i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1009                 i->u64s = 0;
1010                 return 0;
1011         }
1012
1013         if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
1014                          BTREE_ERR_FIXABLE, c, b, i,
1015                          "bset past end of btree node")) {
1016                 i->u64s = 0;
1017                 return 0;
1018         }
1019
1020         btree_err_on(b->written && !i->u64s,
1021                      BTREE_ERR_FIXABLE, c, b, i,
1022                      "empty bset");
1023
1024         if (!BSET_SEPARATE_WHITEOUTS(i)) {
1025                 seen_non_whiteout = true;
1026                 *whiteout_u64s = 0;
1027         }
1028
1029         for (k = i->start;
1030              k != vstruct_last(i);) {
1031                 struct bkey_s_c u;
1032                 struct bkey tmp;
1033                 const char *invalid;
1034
1035                 if (btree_err_on(!k->u64s,
1036                                  BTREE_ERR_FIXABLE, c, b, i,
1037                                  "KEY_U64s 0: %zu bytes of metadata lost",
1038                                  vstruct_end(i) - (void *) k)) {
1039                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1040                         break;
1041                 }
1042
1043                 if (btree_err_on(bkey_next(k) > vstruct_last(i),
1044                                  BTREE_ERR_FIXABLE, c, b, i,
1045                                  "key extends past end of bset")) {
1046                         i->u64s = cpu_to_le16((u64 *) k - i->_data);
1047                         break;
1048                 }
1049
1050                 if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
1051                                  BTREE_ERR_FIXABLE, c, b, i,
1052                                  "invalid bkey format %u", k->format)) {
1053                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1054                         memmove_u64s_down(k, bkey_next(k),
1055                                           (u64 *) vstruct_end(i) - (u64 *) k);
1056                         continue;
1057                 }
1058
1059                 if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
1060                         bch2_bkey_swab(type, &b->format, k);
1061
1062                 u = bkey_disassemble(b, k, &tmp);
1063
1064                 invalid = __bch2_bkey_invalid(c, type, u) ?:
1065                         bch2_bkey_in_btree_node(b, u) ?:
1066                         (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
1067                 if (invalid) {
1068                         char buf[160];
1069
1070                         bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
1071                         btree_err(BTREE_ERR_FIXABLE, c, b, i,
1072                                   "invalid bkey %s: %s", buf, invalid);
1073
1074                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1075                         memmove_u64s_down(k, bkey_next(k),
1076                                           (u64 *) vstruct_end(i) - (u64 *) k);
1077                         continue;
1078                 }
1079
1080                 /*
1081                  * With separate whiteouts (used for extents), the second
1082                  * set of keys can also contain whiteouts, so we can't rely
1083                  * solely on bkey_whiteout()...
1084                  */
1085
1086                 if (!seen_non_whiteout &&
1087                     (!bkey_whiteout(k) ||
1088                      (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) {
1089                         *whiteout_u64s = k->_data - i->_data;
1090                         seen_non_whiteout = true;
1091                 } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
1092                         btree_err(BTREE_ERR_FATAL, c, b, i,
1093                                   "keys out of order: %llu:%llu > %llu:%llu",
1094                                   prev_pos.inode,
1095                                   prev_pos.offset,
1096                                   u.k->p.inode,
1097                                   bkey_start_offset(u.k));
1098                         /* XXX: repair this */
1099                 }
1100
1101                 prev_pos = u.k->p;
1102                 prev = k;
1103                 k = bkey_next(k);
1104         }
1105
1106         SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
1107 fsck_err:
1108         return ret;
1109 }
1110
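/*
 * Parse and validate a btree node that has just been read: verify the magic
 * and per-bset checksums, decrypt, validate each bset, then merge all bsets
 * into a single sorted bset (resolving overlaps for extent nodes) and
 * re-validate the keys' values. Returns nonzero if the read should be retried
 * from another replica.
 */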
1111 int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
1112 {
1113         struct btree_node_entry *bne;
1114         struct btree_node_iter *iter;
1115         struct btree_node *sorted;
1116         struct bkey_packed *k;
1117         struct bset *i;
1118         bool used_mempool;
1119         unsigned u64s;
1120         int ret, retry_read = 0, write = READ;
1121
1122         iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
1123         __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
1124
1125         if (bch2_meta_read_fault("btree"))
1126                 btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
1127                           "dynamic fault");
1128
1129         btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
1130                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1131                      "bad magic");
1132
1133         btree_err_on(!b->data->keys.seq,
1134                      BTREE_ERR_MUST_RETRY, c, b, NULL,
1135                      "bad btree header");
1136
1137         while (b->written < c->opts.btree_node_size) {
1138                 unsigned sectors, whiteout_u64s = 0;
1139                 struct nonce nonce;
1140                 struct bch_csum csum;
1141
1142                 if (!b->written) {
1143                         i = &b->data->keys;
1144
1145                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1146                                      BTREE_ERR_WANT_RETRY, c, b, i,
1147                                      "unknown checksum type");
1148
1149                         nonce = btree_nonce(i, b->written << 9);
1150                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
1151
1152                         btree_err_on(bch2_crc_cmp(csum, b->data->csum),
1153                                      BTREE_ERR_WANT_RETRY, c, b, i,
1154                                      "invalid checksum");
1155
1156                         bset_encrypt(c, i, b->written << 9);
1157
1158                         sectors = vstruct_sectors(b->data, c->block_bits);
1159
1160                         set_btree_bset(b, b->set, &b->data->keys);
1161                         btree_node_set_format(b, b->data->format);
1162                 } else {
1163                         bne = write_block(b);
1164                         i = &bne->keys;
1165
1166                         if (i->seq != b->data->keys.seq)
1167                                 break;
1168
1169                         btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
1170                                      BTREE_ERR_WANT_RETRY, c, b, i,
1171                                      "unknown checksum type");
1172
1173                         nonce = btree_nonce(i, b->written << 9);
1174                         csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1175
1176                         btree_err_on(bch2_crc_cmp(csum, bne->csum),
1177                                      BTREE_ERR_WANT_RETRY, c, b, i,
1178                                      "invalid checksum");
1179
1180                         bset_encrypt(c, i, b->written << 9);
1181
1182                         sectors = vstruct_sectors(bne, c->block_bits);
1183                 }
1184
1185                 ret = validate_bset(c, b, i, sectors, &whiteout_u64s,
1186                                     READ, have_retry);
1187                 if (ret)
1188                         goto fsck_err;
1189
1190                 b->written += sectors;
1191
1192                 ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
1193                 if (ret < 0) {
1194                         btree_err(BTREE_ERR_FATAL, c, b, i,
1195                                   "insufficient memory");
1196                         goto err;
1197                 }
1198
1199                 if (ret) {
1200                         btree_err_on(!b->written,
1201                                      BTREE_ERR_FIXABLE, c, b, i,
1202                                      "first btree node bset has blacklisted journal seq");
1203                         if (b->written)
1204                                 continue;
1205                 }
1206
1207                 __bch2_btree_node_iter_push(iter, b,
1208                                            i->start,
1209                                            vstruct_idx(i, whiteout_u64s));
1210
1211                 __bch2_btree_node_iter_push(iter, b,
1212                                            vstruct_idx(i, whiteout_u64s),
1213                                            vstruct_last(i));
1214         }
1215
1216         for (bne = write_block(b);
1217              bset_byte_offset(b, bne) < btree_bytes(c);
1218              bne = (void *) bne + block_bytes(c))
1219                 btree_err_on(bne->keys.seq == b->data->keys.seq,
1220                              BTREE_ERR_WANT_RETRY, c, b, NULL,
1221                              "found bset signature after last bset");
1222
1223         sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
1224         sorted->keys.u64s = 0;
1225
1226         b->nr = btree_node_is_extents(b)
1227                 ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
1228                 : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
1229
1230         u64s = le16_to_cpu(sorted->keys.u64s);
1231         *sorted = *b->data;
1232         sorted->keys.u64s = cpu_to_le16(u64s);
1233         swap(sorted, b->data);
1234         set_btree_bset(b, b->set, &b->data->keys);
1235         b->nsets = 1;
1236
1237         BUG_ON(b->nr.live_u64s != u64s);
1238
1239         btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
1240
1241         i = &b->data->keys;
1242         for (k = i->start; k != vstruct_last(i);) {
1243                 enum bkey_type type = btree_node_type(b);
1244                 struct bkey tmp;
1245                 struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
1246                 const char *invalid = bch2_bkey_val_invalid(c, type, u);
1247
1248                 if (invalid) {
1249                         char buf[160];
1250
1251                         bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
1252                         btree_err(BTREE_ERR_FIXABLE, c, b, i,
1253                                   "invalid bkey %s: %s", buf, invalid);
1254
1255                         btree_keys_account_key_drop(&b->nr, 0, k);
1256
1257                         i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
1258                         memmove_u64s_down(k, bkey_next(k),
1259                                           (u64 *) vstruct_end(i) - (u64 *) k);
1260                         continue;
1261                 }
1262
1263                 k = bkey_next(k);
1264         }
1265
1266         bch2_bset_build_aux_tree(b, b->set, false);
1267
1268         set_needs_whiteout(btree_bset_first(b));
1269
1270         btree_node_reset_sib_u64s(b);
1271 out:
1272         mempool_free(iter, &c->fill_iter);
1273         return retry_read;
1274 err:
1275 fsck_err:
1276         if (ret == BTREE_RETRY_READ) {
1277                 retry_read = 1;
1278         } else {
1279                 bch2_inconsistent_error(c);
1280                 set_btree_node_read_error(b);
1281         }
1282         goto out;
1283 }
1284
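/*
 * Read completion runs here in process context: if the read failed or the
 * node didn't validate, retry from the remaining replicas (tracking failed
 * devices in 'avoid') until we run out, then mark the node as errored.
 */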
1285 static void btree_node_read_work(struct work_struct *work)
1286 {
1287         struct btree_read_bio *rb =
1288                 container_of(work, struct btree_read_bio, work);
1289         struct bch_fs *c        = rb->c;
1290         struct btree *b         = rb->bio.bi_private;
1291         struct bio *bio         = &rb->bio;
1292         struct bch_devs_mask avoid;
1293
1294         memset(&avoid, 0, sizeof(avoid));
1295
1296         goto start;
1297         do {
1298                 bch_info(c, "retrying read");
1299                 bio_reset(bio);
1300                 bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1301                 bio->bi_bdev            = rb->pick.ca->disk_sb.bdev;
1302                 bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
1303                 bio->bi_iter.bi_size    = btree_bytes(c);
1304                 submit_bio_wait(bio);
1305 start:
1306                 bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
1307                 percpu_ref_put(&rb->pick.ca->io_ref);
1308
1309                 __set_bit(rb->pick.ca->dev_idx, avoid.d);
1310                 rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
1311
1312                 if (!bio->bi_status &&
1313                     !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
1314                         goto out;
1315         } while (!IS_ERR_OR_NULL(rb->pick.ca));
1316
1317         set_btree_node_read_error(b);
1318 out:
1319         if (!IS_ERR_OR_NULL(rb->pick.ca))
1320                 percpu_ref_put(&rb->pick.ca->io_ref);
1321
1322         bch2_time_stats_update(&c->btree_read_time, rb->start_time);
1323         bio_put(&rb->bio);
1324         clear_btree_node_read_in_flight(b);
1325         wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
1326 }
1327
1328 static void btree_node_read_endio(struct bio *bio)
1329 {
1330         struct btree_read_bio *rb =
1331                 container_of(bio, struct btree_read_bio, bio);
1332
1333         bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
1334
1335         INIT_WORK(&rb->work, btree_node_read_work);
1336         schedule_work(&rb->work);
1337 }
1338
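/*
 * Kick off (or, if @sync, perform) a read of btree node @b from the best
 * available replica; completion is handled by btree_node_read_work() above.
 */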
1339 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
1340                           bool sync)
1341 {
1342         struct extent_pick_ptr pick;
1343         struct btree_read_bio *rb;
1344         struct bio *bio;
1345
1346         trace_btree_read(c, b);
1347
1348         pick = bch2_btree_pick_ptr(c, b, NULL);
1349         if (bch2_fs_fatal_err_on(!pick.ca, c,
1350                         "btree node read error: no device to read from")) {
1351                 set_btree_node_read_error(b);
1352                 return;
1353         }
1354
1355         bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
1356         rb = container_of(bio, struct btree_read_bio, bio);
1357         rb->c                   = c;
1358         rb->start_time          = local_clock();
1359         rb->pick                = pick;
1360         bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
1361         bio->bi_bdev            = pick.ca->disk_sb.bdev;
1362         bio->bi_iter.bi_sector  = pick.ptr.offset;
1363         bio->bi_iter.bi_size    = btree_bytes(c);
1364         bch2_bio_map(bio, b->data);
1365
1366         this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
1367                      bio_sectors(bio));
1368
1369         set_btree_node_read_in_flight(b);
1370
1371         if (sync) {
1372                 submit_bio_wait(bio);
1373                 bio->bi_private = b;
1374                 btree_node_read_work(&rb->work);
1375         } else {
1376                 bio->bi_end_io  = btree_node_read_endio;
1377                 bio->bi_private = b;
1378                 submit_bio(bio);
1379         }
1380 }
1381
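/*
 * Read a btree root during mount/recovery: allocate a cached btree node, read
 * it synchronously, and on success install it as the root for @id; on error
 * the node is returned to the freeable list and -EIO is returned.
 */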
1382 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
1383                         const struct bkey_i *k, unsigned level)
1384 {
1385         struct closure cl;
1386         struct btree *b;
1387         int ret;
1388
1389         closure_init_stack(&cl);
1390
1391         do {
1392                 ret = bch2_btree_cache_cannibalize_lock(c, &cl);
1393                 closure_sync(&cl);
1394         } while (ret);
1395
1396         b = bch2_btree_node_mem_alloc(c);
1397         bch2_btree_cache_cannibalize_unlock(c);
1398
1399         BUG_ON(IS_ERR(b));
1400
1401         bkey_copy(&b->key, k);
1402         BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
1403
1404         bch2_btree_node_read(c, b, true);
1405
1406         if (btree_node_read_error(b)) {
1407                 bch2_btree_node_hash_remove(&c->btree_cache, b);
1408
1409                 mutex_lock(&c->btree_cache.lock);
1410                 list_move(&b->list, &c->btree_cache.freeable);
1411                 mutex_unlock(&c->btree_cache.lock);
1412
1413                 ret = -EIO;
1414                 goto err;
1415         }
1416
1417         bch2_btree_set_root_for_read(c, b);
1418 err:
1419         six_unlock_write(&b->lock);
1420         six_unlock_intent(&b->lock);
1421
1422         return ret;
1423 }
1424
1425 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
1426                               struct btree_write *w)
1427 {
1428         bch2_journal_pin_drop(&c->journal, &w->journal);
1429         closure_wake_up(&w->wait);
1430 }
1431
1432 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
1433 {
1434         struct btree_write *w = btree_prev_write(b);
1435
1436         bch2_btree_complete_write(c, b, w);
1437         btree_node_io_unlock(b);
1438 }
1439
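/*
 * A btree node write failed on one or more devices: drop the pointers to the
 * failed devices and rewrite the node's key with the surviving pointers, so
 * the node is no longer considered replicated on those devices. If no
 * pointers survive, it is treated as a fatal error.
 */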
1440 static void bch2_btree_node_write_error(struct bch_fs *c,
1441                                         struct btree_write_bio *wbio)
1442 {
1443         struct btree *b         = wbio->wbio.bio.bi_private;
1444         struct closure *cl      = wbio->cl;
1445         __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
1446         struct bkey_i_extent *new_key;
1447         struct bkey_s_extent e;
1448         struct bch_extent_ptr *ptr;
1449         struct btree_iter iter;
1450         int ret;
1451
1452         __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
1453                                BTREE_MAX_DEPTH,
1454                                b->level, 0);
1455 retry:
1456         ret = bch2_btree_iter_traverse(&iter);
1457         if (ret)
1458                 goto err;
1459
1460         /* has the node been freed while this write was in flight? */
1461         if (iter.nodes[b->level] != b) {
1462                 /* it should have been flagged as dying: */
1463                 if (!btree_node_dying(b))
1464                         panic("btree node freed but not flagged as dying\n");
1465                 goto out;
1466         }
1467
1468         if (!btree_node_hashed(b))
1469                 panic("btree node in write error path not hashed\n");
1470
1471         bkey_copy(&tmp.k, &b->key);
1472
1473         new_key = bkey_i_to_extent(&tmp.k);
1474         e = extent_i_to_s(new_key);
1475         extent_for_each_ptr_backwards(e, ptr)
1476                 if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
1477                         bch2_extent_drop_ptr(e, ptr);
1478
1479         if (!bch2_extent_nr_ptrs(e.c))
1480                 goto err;
1481
1482         ret = bch2_btree_node_update_key(c, &iter, b, new_key);
1483         if (ret == -EINTR)
1484                 goto retry;
1485         if (ret)
1486                 goto err;
1487 out:
1488         bch2_btree_iter_unlock(&iter);
1489         bio_put(&wbio->wbio.bio);
1490         btree_node_write_done(c, b);
1491         if (cl)
1492                 closure_put(cl);
1493         return;
1494 err:
1495         set_btree_node_noevict(b);
1496         bch2_fs_fatal_error(c, "fatal error writing btree node");
1497         goto out;
1498 }
1499
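     /*
      * Workqueue entry point for btree write errors: pop each bio off
      * btree_write_error_list and run the error path for it.
      */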
1500 void bch2_btree_write_error_work(struct work_struct *work)
1501 {
1502         struct bch_fs *c = container_of(work, struct bch_fs,
1503                                         btree_write_error_work);
1504         struct bio *bio;
1505
1506         while (1) {
1507                 spin_lock_irq(&c->btree_write_error_lock);
1508                 bio = bio_list_pop(&c->btree_write_error_list);
1509                 spin_unlock_irq(&c->btree_write_error_lock);
1510
1511                 if (!bio)
1512                         break;
1513
1514                 bch2_btree_node_write_error(c,
1515                         container_of(bio, struct btree_write_bio, wbio.bio));
1516         }
1517 }
1518
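     /*
      * Write completion work, run from the endio handler: free the bounce
      * buffer, and either punt to the write error path (if the write failed on
      * any device) or finish the write.
      */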
1519 static void btree_node_write_work(struct work_struct *work)
1520 {
1521         struct btree_write_bio *wbio =
1522                 container_of(work, struct btree_write_bio, work);
1523         struct closure *cl      = wbio->cl;
1524         struct bch_fs *c        = wbio->wbio.c;
1525         struct btree *b         = wbio->wbio.bio.bi_private;
1526
1527         btree_bounce_free(c,
1528                 wbio->wbio.order,
1529                 wbio->wbio.used_mempool,
1530                 wbio->data);
1531
1532         if (wbio->wbio.failed.nr) {
1533                 unsigned long flags;
1534
1535                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1536                 bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
1537                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1538
1539                 queue_work(c->wq, &c->btree_write_error_work);
1540                 return;
1541         }
1542
1543         bio_put(&wbio->wbio.bio);
1544         btree_node_write_done(c, b);
1545         if (cl)
1546                 closure_put(cl);
1547 }
1548
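     /*
      * bio completion for btree node writes: account IO latency, record any
      * devices the write failed on, drop the device io ref, then either
      * complete the parent bio (for split bios) or hand off to
      * btree_node_write_work().
      */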
1549 static void btree_node_write_endio(struct bio *bio)
1550 {
1551         struct bch_write_bio *wbio      = to_wbio(bio);
1552         struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
1553         struct bch_write_bio *orig      = parent ?: wbio;
1554         struct bch_fs *c                = wbio->c;
1555         struct bch_dev *ca              = wbio->ca;
1556         unsigned long flags;
1557
1558         bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
1559
1560         if (bio->bi_status == BLK_STS_REMOVED ||
1561             bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
1562             bch2_meta_write_fault("btree")) {
1563                 spin_lock_irqsave(&c->btree_write_error_lock, flags);
1564                 bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
1565                 spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
1566         }
1567
1568         if (wbio->have_io_ref)
1569                 percpu_ref_put(&ca->io_ref);
1570
1571         if (parent) {
1572                 bio_put(bio);
1573                 bio_endio(&parent->bio);
1574         } else {
1575                 struct btree_write_bio *wb =
1576                         container_of(orig, struct btree_write_bio, wbio);
1577
1578                 INIT_WORK(&wb->work, btree_node_write_work);
1579                 schedule_work(&wb->work);
1580         }
1581 }
1582
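     /*
      * Validate a bset before it's written out; a failure here triggers
      * bch2_inconsistent_error():
      */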
1583 static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
1584                                    struct bset *i, unsigned sectors)
1585 {
1586         const struct bch_extent_ptr *ptr;
1587         unsigned whiteout_u64s = 0;
1588         int ret;
1589
1590         extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
1591                 break;
1592
1593         ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE, false);
1594         if (ret)
1595                 bch2_inconsistent_error(c);
1596
1597         return ret;
1598 }
1599
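     /*
      * Write out everything in a btree node that hasn't been written yet: the
      * unwritten bsets are merged into a single new bset in a bounce buffer,
      * checksummed (and possibly encrypted), and appended to the node on disk.
      * May be called with only a read lock held - see the comment on the dirty
      * bit below.
      */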
1600 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1601                             struct closure *parent,
1602                             enum six_lock_type lock_type_held)
1603 {
1604         struct btree_write_bio *wbio;
1605         struct bset_tree *t;
1606         struct bset *i;
1607         struct btree_node *bn = NULL;
1608         struct btree_node_entry *bne = NULL;
1609         BKEY_PADDED(key) k;
1610         struct bkey_s_extent e;
1611         struct bch_extent_ptr *ptr;
1612         struct sort_iter sort_iter;
1613         struct nonce nonce;
1614         unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
1615         u64 seq = 0;
1616         bool used_mempool;
1617         unsigned long old, new;
1618         void *data;
1619
1620         if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
1621                 return;
1622
1623         /*
1624          * We may only have a read lock on the btree node - the dirty bit is our
1625          * "lock" against racing with other threads that may be trying to start
1626          * a write; we only do the write if we're the one to clear the dirty bit.
1627          * Since setting the dirty bit requires a write lock, we can't race with
1628          * other threads redirtying it:
1629          */
1630         do {
1631                 old = new = READ_ONCE(b->flags);
1632
1633                 if (!(old & (1 << BTREE_NODE_dirty)))
1634                         return;
1635
1636                 if (b->written &&
1637                     !btree_node_may_write(b))
1638                         return;
1639
1640                 if (old & (1 << BTREE_NODE_write_in_flight)) {
1641                         btree_node_wait_on_io(b);
1642                         continue;
1643                 }
1644
1645                 new &= ~(1 << BTREE_NODE_dirty);
1646                 new &= ~(1 << BTREE_NODE_need_write);
1647                 new |=  (1 << BTREE_NODE_write_in_flight);
1648                 new |=  (1 << BTREE_NODE_just_written);
1649                 new ^=  (1 << BTREE_NODE_write_idx);
1650         } while (cmpxchg_acquire(&b->flags, old, new) != old);
1651
1652         BUG_ON(btree_node_fake(b));
1653         BUG_ON(!list_empty(&b->write_blocked));
1654         BUG_ON((b->will_make_reachable != NULL) != !b->written);
1655
1656         BUG_ON(b->written >= c->opts.btree_node_size);
1657         BUG_ON(bset_written(b, btree_bset_last(b)));
1658         BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
1659         BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
1660
1661         if (lock_type_held == SIX_LOCK_intent) {
1662                 six_lock_write(&b->lock);
1663                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
1664                 six_unlock_write(&b->lock);
1665         } else {
1666                 __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK);
1667         }
1668
1669         BUG_ON(b->uncompacted_whiteout_u64s);
1670
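             /*
              * Gather up everything that hasn't been written yet: all unwritten
              * bsets plus the unwritten whiteouts get merge sorted into a
              * single new bset in a bounce buffer:
              */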
1671         sort_iter_init(&sort_iter, b);
1672
1673         bytes = !b->written
1674                 ? sizeof(struct btree_node)
1675                 : sizeof(struct btree_node_entry);
1676
1677         bytes += b->whiteout_u64s * sizeof(u64);
1678
1679         for_each_bset(b, t) {
1680                 i = bset(b, t);
1681
1682                 if (bset_written(b, i))
1683                         continue;
1684
1685                 bytes += le16_to_cpu(i->u64s) * sizeof(u64);
1686                 sort_iter_add(&sort_iter,
1687                               btree_bkey_first(b, t),
1688                               btree_bkey_last(b, t));
1689                 seq = max(seq, le64_to_cpu(i->journal_seq));
1690         }
1691
1692         order = get_order(bytes);
1693         data = btree_bounce_alloc(c, order, &used_mempool);
1694
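             /*
              * The first write of a node writes a full struct btree_node,
              * including the node header; subsequent writes append a struct
              * btree_node_entry:
              */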
1695         if (!b->written) {
1696                 bn = data;
1697                 *bn = *b->data;
1698                 i = &bn->keys;
1699         } else {
1700                 bne = data;
1701                 bne->keys = b->data->keys;
1702                 i = &bne->keys;
1703         }
1704
1705         i->journal_seq  = cpu_to_le64(seq);
1706         i->u64s         = 0;
1707
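             /*
              * In non-extent btrees, whiteouts are sorted in with the rest of
              * the keys; in extent btrees they're kept as a separate block at
              * the start of the bset:
              */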
1708         if (!btree_node_is_extents(b)) {
1709                 sort_iter_add(&sort_iter,
1710                               unwritten_whiteouts_start(c, b),
1711                               unwritten_whiteouts_end(c, b));
1712                 SET_BSET_SEPARATE_WHITEOUTS(i, false);
1713         } else {
1714                 memcpy_u64s(i->start,
1715                             unwritten_whiteouts_start(c, b),
1716                             b->whiteout_u64s);
1717                 i->u64s = cpu_to_le16(b->whiteout_u64s);
1718                 SET_BSET_SEPARATE_WHITEOUTS(i, true);
1719         }
1720
1721         b->whiteout_u64s = 0;
1722
1723         u64s = btree_node_is_extents(b)
1724                 ? sort_extents(vstruct_last(i), &sort_iter, false)
1725                 : sort_keys(i->start, &sort_iter, false);
1726         le16_add_cpu(&i->u64s, u64s);
1727
1728         clear_needs_whiteout(i);
1729
1730         /* do we have data to write? */
1731         if (b->written && !i->u64s)
1732                 goto nowrite;
1733
1734         bytes_to_write = vstruct_end(i) - data;
1735         sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
1736
1737         memset(data + bytes_to_write, 0,
1738                (sectors_to_write << 9) - bytes_to_write);
1739
1740         BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
1741         BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
1742         BUG_ON(i->seq != b->data->keys.seq);
1743
1744         i->version = cpu_to_le16(BCACHE_BSET_VERSION);
1745         SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
1746
1747         /* if we're going to be encrypting, check metadata validity first: */
1748         if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1749             validate_bset_for_write(c, b, i, sectors_to_write))
1750                 goto err;
1751
1752         bset_encrypt(c, i, b->written << 9);
1753
1754         nonce = btree_nonce(i, b->written << 9);
1755
1756         if (bn)
1757                 bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
1758         else
1759                 bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
1760
1761         /* if we're not encrypting, check metadata after checksumming: */
1762         if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
1763             validate_bset_for_write(c, b, i, sectors_to_write))
1764                 goto err;
1765
1766         /*
1767          * We handle btree write errors by immediately halting the journal -
1768          * after we've done that, we can't issue any subsequent btree writes
1769          * because they might have pointers to new nodes that failed to write.
1770          *
1771          * Furthermore, there's no point in doing any more btree writes because
1772          * with the journal stopped, we're never going to update the journal to
1773          * reflect that those writes were done and the data flushed from the
1774          * journal:
1775          *
1776          * Make sure to update b->written so bch2_btree_init_next() doesn't
1777          * break:
1778          */
1779         if (bch2_journal_error(&c->journal) ||
1780             c->opts.nochanges)
1781                 goto err;
1782
1783         trace_btree_write(b, bytes_to_write, sectors_to_write);
1784
1785         wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
1786                             struct btree_write_bio, wbio.bio);
1787         wbio_init(&wbio->wbio.bio);
1788         wbio->data                      = data;
1789         wbio->cl                        = parent;
1790         wbio->wbio.order                = order;
1791         wbio->wbio.used_mempool         = used_mempool;
1792         wbio->wbio.bio.bi_opf           = REQ_OP_WRITE|REQ_META|REQ_FUA;
1793         wbio->wbio.bio.bi_iter.bi_size  = sectors_to_write << 9;
1794         wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
1795         wbio->wbio.bio.bi_private       = b;
1796
1797         if (parent)
1798                 closure_get(parent);
1799
1800         bch2_bio_map(&wbio->wbio.bio, data);
1801
1802         /*
1803          * If we're appending to a leaf node, we don't technically need FUA -
1804          * this write just needs to be persisted before the next journal write,
1805          * which will be marked FLUSH|FUA.
1806          *
1807          * Similarly if we're writing a new btree root - the pointer is going to
1808          * be in the next journal entry.
1809          *
1810          * But if we're writing a new btree node (that isn't a root) or
1811          * appending to a non leaf btree node, we need either FUA or a flush
1812          * when we write the parent with the new pointer. FUA is cheaper than a
1813          * flush, and writes appending to leaf nodes aren't blocking anything so
1814          * just make all btree node writes FUA to keep things sane.
1815          */
1816
1817         bkey_copy(&k.key, &b->key);
1818         e = bkey_i_to_s_extent(&k.key);
1819
1820         extent_for_each_ptr(e, ptr)
1821                 ptr->offset += b->written;
1822
1823         b->written += sectors_to_write;
1824
1825         bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
1826         return;
1827 err:
1828         set_btree_node_noevict(b);
1829         b->written += sectors_to_write;
1830 nowrite:
1831         btree_bounce_free(c, order, used_mempool, data);
1832         btree_node_write_done(c, b);
1833 }
1834
1835 /*
1836  * Post-write work that must be done with the write lock held:
1837  */
1838 bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
1839 {
1840         bool invalidated_iter = false;
1841         struct btree_node_entry *bne;
1842         struct bset_tree *t;
1843
1844         if (!btree_node_just_written(b))
1845                 return false;
1846
1847         BUG_ON(b->whiteout_u64s);
1848         BUG_ON(b->uncompacted_whiteout_u64s);
1849
1850         clear_btree_node_just_written(b);
1851
1852         /*
1853          * Note: immediately after write, bset_unwritten()/bset_written() don't
1854          * work - the amount of data we had to write after compaction might have
1855          * been smaller than the offset of the last bset.
1856          *
1857          * However, we know that all bsets have been written here, as long as
1858          * we're still holding the write lock:
1859          */
1860
1861         /*
1862          * XXX: decide if we really want to unconditionally sort down to a
1863          * single bset:
1864          */
1865         if (b->nsets > 1) {
1866                 btree_node_sort(c, b, NULL, 0, b->nsets, true);
1867                 invalidated_iter = true;
1868         } else {
1869                 invalidated_iter = bch2_drop_whiteouts(b);
1870         }
1871
1872         for_each_bset(b, t)
1873                 set_needs_whiteout(bset(b, t));
1874
1875         bch2_btree_verify(c, b);
1876
1877         /*
1878          * If later we don't unconditionally sort down to a single bset, we have
1879          * to ensure this is still true:
1880          */
1881         BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
1882
1883         bne = want_new_bset(c, b);
1884         if (bne)
1885                 bch2_bset_init_next(b, &bne->keys);
1886
1887         bch2_btree_build_aux_trees(b);
1888
1889         return invalidated_iter;
1890 }
1891
1892 /*
1893  * Use this one if the caller holds a read or intent lock on the node:
1894  */
1895 void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
1896                           struct closure *parent,
1897                           enum six_lock_type lock_type_held)
1898 {
1899         BUG_ON(lock_type_held == SIX_LOCK_write);
1900
1901         if (lock_type_held == SIX_LOCK_intent ||
1902             six_trylock_convert(&b->lock, SIX_LOCK_read,
1903                                 SIX_LOCK_intent)) {
1904                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
1905
1906                 /* don't cycle lock unnecessarily: */
1907                 if (btree_node_just_written(b)) {
1908                         six_lock_write(&b->lock);
1909                         bch2_btree_post_write_cleanup(c, b);
1910                         six_unlock_write(&b->lock);
1911                 }
1912
1913                 if (lock_type_held == SIX_LOCK_read)
1914                         six_lock_downgrade(&b->lock);
1915         } else {
1916                 __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
1917         }
1918 }
1919
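     /* Assert that no btree node in the cache is still dirty: */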
1920 void bch2_btree_verify_flushed(struct bch_fs *c)
1921 {
1922         struct bucket_table *tbl;
1923         struct rhash_head *pos;
1924         struct btree *b;
1925         unsigned i;
1926
1927         rcu_read_lock();
1928         for_each_cached_btree(b, c, tbl, i, pos)
1929                 BUG_ON(btree_node_dirty(b));
1930         rcu_read_unlock();
1931 }