// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"

#include <trace/events/bcachefs.h>

static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
        return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}
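
/*
 * A sketch of the index math above (the base_seq value is just an
 * example): the radix tree is indexed by unsigned long, so sequence
 * numbers are rebased against journal_entries_base_seq and masked down
 * to 31 bits (~0U >> 1 == 0x7fffffff). E.g. with base_seq = 1000:
 *
 *      journal_entry_radix_idx(c, 1000) == 0
 *      journal_entry_radix_idx(c, 1042) == 42
 *
 * so any sequence number within roughly +-2 billion of the base maps to
 * a small, non-negative index.
 */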

static void __journal_replay_free(struct bch_fs *c,
                                  struct journal_replay *i)
{
        struct journal_replay **p =
                genradix_ptr(&c->journal_entries,
                             journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

        BUG_ON(*p != i);
        *p = NULL;
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
        i->ignore = true;

        if (!c->opts.read_entire_journal)
                __journal_replay_free(c, i);
}

struct journal_list {
        struct closure          cl;
        u64                     last_seq;
        struct mutex            lock;
        int                     ret;
};

#define JOURNAL_ENTRY_ADD_OK            0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE  5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_ptr entry_ptr,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
{
        struct genradix_iter iter;
        struct journal_replay **_i, *i, *dup;
        struct journal_ptr *ptr;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
        int ret = JOURNAL_ENTRY_ADD_OK;

        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
            le64_to_cpu(j->seq) < jlist->last_seq)
                return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

        /*
         * genradixes are indexed by a ulong, not a u64, so we can't index them
         * by sequence number directly: assume instead that they will all fall
         * within the range of +-2 billion of the first one we find.
         */
        if (!c->journal_entries_base_seq)
                c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

        /* Drop entries we don't need anymore */
        if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
                genradix_for_each_from(&c->journal_entries, iter, _i,
                                       journal_entry_radix_idx(c, jlist->last_seq)) {
                        i = *_i;

                        if (!i || i->ignore)
                                continue;

                        if (le64_to_cpu(i->j.seq) >= last_seq)
                                break;
                        journal_replay_free(c, i);
                }
        }

        jlist->last_seq = max(jlist->last_seq, last_seq);

        _i = genradix_ptr_alloc(&c->journal_entries,
                                journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
                                GFP_KERNEL);
        if (!_i)
                return -ENOMEM;

        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
        dup = *_i;
        if (dup) {
                if (dup->bad) {
                        /* we'll replace @dup: */
                } else if (bad) {
                        i = dup;
                        goto found;
                } else {
                        fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
                                    memcmp(j, &dup->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
                        i = dup;
                        goto found;
                }
        }

        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i)
                return -ENOMEM;

        i->nr_ptrs      = 0;
        i->bad          = bad;
        i->ignore       = false;
        memcpy(&i->j, j, bytes);

        if (dup) {
                i->nr_ptrs = dup->nr_ptrs;
                memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
                __journal_replay_free(c, dup);
        }

        *_i = i;
found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
                        bch_err(c, "duplicate journal entry %llu on same device",
                                le64_to_cpu(i->j.seq));
                        goto out;
                }
        }

        if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
                bch_err(c, "found too many copies of journal entry %llu",
                        le64_to_cpu(i->j.seq));
                goto out;
        }

        i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
        return ret;
}
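
/*
 * The duplicate-resolution rules implemented above, restated for
 * entries with the same sequence number read from different devices:
 *
 *      existing copy bad, new copy good -> replace the existing copy
 *      existing copy good, new copy bad -> keep the existing copy
 *      both copies good                 -> keep the existing copy;
 *                                          fsck warns if the two copies
 *                                          aren't bitwise identical
 *
 * Whichever copy is kept, the new device/bucket location is appended to
 * i->ptrs, so every on-disk replica of the entry stays tracked.
 */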

static struct nonce journal_nonce(const struct jset *jset)
{
        return (struct nonce) {{
                [0] = 0,
                [1] = ((__le32 *) &jset->seq)[0],
                [2] = ((__le32 *) &jset->seq)[1],
                [3] = BCH_NONCE_JOURNAL,
        }};
}
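
/*
 * A note on the nonce layout above, as a reading aid: words 1 and 2
 * carry the low and high halves of the little-endian 64-bit sequence
 * number, and word 3 tags the nonce as a journal nonce
 * (BCH_NONCE_JOURNAL), keeping journal nonces distinct from nonces used
 * for other metadata, so no two encrypted journal entries share a
 * nonce.
 */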

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
        struct jset_entry *entry;

        for (entry = start; entry != end; entry = vstruct_next(entry))
                memset(entry, 0, sizeof(*entry));
}
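
/*
 * Why zeroing just the header works, briefly: a zeroed jset_entry has
 * u64s == 0, so vstruct_next() advances by exactly one header. The loop
 * therefore tiles the range with minimal empty entries that later
 * passes skip over, without having to shrink the containing jset.
 */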

#define JOURNAL_ENTRY_REREAD    5
#define JOURNAL_ENTRY_NONE      6
#define JOURNAL_ENTRY_BAD       7

static void journal_entry_err_msg(struct printbuf *out,
                                  struct jset *jset,
                                  struct jset_entry *entry)
{
        prt_str(out, "invalid journal entry ");
        if (entry)
                prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);

        if (!jset)
                prt_printf(out, "in superblock");
        else if (!entry)
                prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
        else
                prt_printf(out, "at offset %zi/%u seq %llu",
                           (u64 *) entry - jset->_data,
                           le32_to_cpu(jset->u64s),
                           le64_to_cpu(jset->seq));
        prt_str(out, ": ");
}

#define journal_entry_err(c, jset, entry, msg, ...)                     \
({                                                                      \
        struct printbuf buf = PRINTBUF;                                 \
                                                                        \
        journal_entry_err_msg(&buf, jset, entry);                       \
        prt_printf(&buf, msg, ##__VA_ARGS__);                           \
                                                                        \
        switch (write) {                                                \
        case READ:                                                      \
                mustfix_fsck_err(c, "%s", buf.buf);                     \
                break;                                                  \
        case WRITE:                                                     \
                bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        }                                                               \
                                                                        \
        printbuf_exit(&buf);                                            \
        true;                                                           \
})

#define journal_entry_err_on(cond, c, jset, entry, msg, ...)            \
        ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
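
/*
 * Usage sketch (not a new call site) - validators below invoke the
 * macro like this:
 *
 *      if (journal_entry_err_on(cond, c, jset, entry, "bad size")) {
 *              journal_entry_null_range(entry, vstruct_next(entry));
 *              return ret;
 *      }
 *
 * Note that the macro relies on 'write', 'ret' and an 'fsck_err' label
 * existing in the enclosing function: at READ time the error is routed
 * through mustfix_fsck_err() and is repairable, at WRITE time it logs
 * corrupt metadata and may abort via bch2_fs_inconsistent().
 */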

#define FSCK_DELETED_KEY        5

static int journal_validate_key(struct bch_fs *c,
                                struct jset *jset,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
                                struct bkey_i *k,
                                unsigned version, int big_endian, int write)
{
        void *next = vstruct_next(entry);
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on((void *) bkey_next(k) >
                                 (void *) vstruct_next(entry),
                                 c, jset, entry,
                                 "extends past end of journal entry")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
                                 c, jset, entry,
                                 "bad format %u", k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (!write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));

        if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                              __btree_node_type(level, btree_id), write, &buf)) {
                printbuf_reset(&buf);
                prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
                           bch2_jset_entry_types[entry->type],
                           (u64 *) entry - jset->_data,
                           le32_to_cpu(jset->u64s),
                           le64_to_cpu(jset->seq));
                prt_newline(&buf);
                printbuf_indent_add(&buf, 2);

                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
                prt_newline(&buf);
                bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                  __btree_node_type(level, btree_id), write, &buf);

                mustfix_fsck_err(c, "%s", buf.buf);

                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);

                printbuf_exit(&buf);
                return FSCK_DELETED_KEY;
        }

        if (write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));
fsck_err:
        printbuf_exit(&buf);
        return ret;
}
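
/*
 * Two distinct repair strategies above, summarized: when the key's size
 * field can't be trusted (u64s == 0, or the key overruns the entry) the
 * entry is truncated at the bad key, since everything after it is
 * unparseable; when the key is parseable but invalid (bad format, or it
 * fails bch2_bkey_invalid()) just that key is cut out with memmove()
 * and the freed tail is nulled, preserving the keys that follow it.
 */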

static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                             struct jset *jset,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;

        while (k != vstruct_last(entry)) {
                int ret = journal_validate_key(c, jset, entry,
                                               entry->level,
                                               entry->btree_id,
                                               k, version, big_endian, write);
                if (ret == FSCK_DELETED_KEY)
                        continue;

                k = bkey_next(k);
        }

        return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct bkey_i *k;
        bool first = true;

        vstruct_for_each(entry, k) {
                if (!first) {
                        prt_newline(out);
                        prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                }
                prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
                bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
                first = false;
        }
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
                                             struct jset *jset,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;
        int ret = 0;

        if (journal_entry_err_on(!entry->u64s ||
                                 le16_to_cpu(entry->u64s) != k->k.u64s,
                                 c, jset, entry,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
                 * we don't want to null out this jset_entry,
                 * just the contents, so that later we can tell
                 * we were _supposed_ to have a btree root
                 */
                entry->u64s = 0;
                journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }

        return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
                                    version, big_endian, write);
fsck_err:
        return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
                                            struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        /* obsolete, don't care: */
        return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
                                            struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
                                 c, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
fsck_err:
        return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_blacklist *bl =
                container_of(entry, struct jset_entry_blacklist, entry);

        prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
                                               struct jset *jset,
                                               struct jset_entry *entry,
                                               unsigned version, int big_endian, int write)
{
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
                                 c, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
        }

        bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
                                 le64_to_cpu(bl_entry->end),
                                 c, jset, entry,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
out:
fsck_err:
        return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
                                               struct jset_entry *entry)
{
        struct jset_entry_blacklist_v2 *bl =
                container_of(entry, struct jset_entry_blacklist_v2, entry);

        prt_printf(out, "start=%llu end=%llu",
               le64_to_cpu(bl->start),
               le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
                                        struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u),
                                 c, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);

        prt_printf(out, "type=%s v=%llu",
               bch2_fs_usage_types[u->entry.btree_id],
               le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
                                        struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
                                 c, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);

        bch2_replicas_entry_to_text(out, &u->r);
        prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
                                        struct jset *jset,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes != sizeof(*clock),
                                 c, jset, entry, "bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(clock->rw > 1,
                                 c, jset, entry, "bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);

        prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
                                            struct jset *jset,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        unsigned expected = sizeof(*u);
        unsigned dev;
        int ret = 0;

        if (journal_entry_err_on(bytes < expected,
                                 c, jset, entry, "bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        dev = le32_to_cpu(u->dev);

        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
                                 c, jset, entry, "bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(u->pad,
                                 c, jset, entry, "bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

        prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

        for (i = 0; i < nr_types; i++) {
                if (i < BCH_DATA_NR)
                        prt_printf(out, " %s", bch2_data_types[i]);
                else
                        prt_printf(out, " (unknown data type %u)", i);
                prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
                       le64_to_cpu(u->d[i].buckets),
                       le64_to_cpu(u->d[i].sectors),
                       le64_to_cpu(u->d[i].fragmented));
        }

        prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}

static int journal_entry_log_validate(struct bch_fs *c,
                                      struct jset *jset,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
{
        return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
                                      struct jset_entry *entry)
{
        struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
        unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

        prt_printf(out, "%.*s", bytes, l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
                                      struct jset *jset,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
{
        return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        journal_entry_btree_keys_to_text(out, c, entry);
}

struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, unsigned, int, int);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)                                                \
        [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
                .validate       = journal_entry_##f##_validate, \
                .to_text        = journal_entry_##f##_to_text,  \
        },
        BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c,
                                struct jset *jset,
                                struct jset_entry *entry,
                                unsigned version, int big_endian, int write)
{
        return entry->type < BCH_JSET_ENTRY_NR
                ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
                                version, big_endian, write)
                : 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
                                struct jset_entry *entry)
{
        if (entry->type < BCH_JSET_ENTRY_NR) {
                prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
        } else {
                prt_printf(out, "(unknown type %u)", entry->type);
        }
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
{
        struct jset_entry *entry;
        int ret = 0;

        vstruct_for_each(jset, entry) {
                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(jset), c, jset, entry,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }

                ret = bch2_journal_entry_validate(c, jset, entry,
                                        le32_to_cpu(jset->version),
                                        JSET_BIG_ENDIAN(jset), write);
                if (ret)
                        break;
        }
fsck_err:
        return ret;
}

static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
                         unsigned bucket_sectors_left,
                         unsigned sectors_read,
                         int write)
{
        size_t bytes = vstruct_bytes(jset);
        struct bch_csum csum;
        unsigned version;
        int ret = 0;

        if (le64_to_cpu(jset->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;

        version = le32_to_cpu(jset->version);
        if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
                                  version < bcachefs_metadata_version_min) ||
                                 version >= bcachefs_metadata_version_max,
                                 c, jset, NULL,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        version)) {
                /* don't try to continue: */
                return -EINVAL;
        }

        if (bytes > (sectors_read << 9) &&
            sectors_read < bucket_sectors_left)
                return JOURNAL_ENTRY_REREAD;

        if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
                                 c, jset, NULL,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq), bytes)) {
                ret = JOURNAL_ENTRY_BAD;
                le32_add_cpu(&jset->u64s,
                             -((bytes - (bucket_sectors_left << 9)) / 8));
        }

        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
                                 c, jset, NULL,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        JSET_CSUM_TYPE(jset))) {
                ret = JOURNAL_ENTRY_BAD;
                goto csum_done;
        }

        if (write)
                goto csum_done;

        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum),
                                 c, jset, NULL,
                                 "%s sector %llu seq %llu: journal checksum bad",
                                 ca ? ca->name : c->name,
                                 sector, le64_to_cpu(jset->seq)))
                ret = JOURNAL_ENTRY_BAD;

        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                     jset->encrypted_start,
                     vstruct_end(jset) - (void *) jset->encrypted_start);
        bch2_fs_fatal_err_on(ret, c,
                        "error decrypting journal entry: %i", ret);
csum_done:
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
                                 c, jset, NULL,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
fsck_err:
        return ret;
}
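
/*
 * Validation order in jset_validate(), as a summary: magic, then
 * version, then a size check that may ask the caller to re-read with a
 * bigger buffer (JOURNAL_ENTRY_REREAD), then checksum verification, and
 * only then in-place decryption. bch2_encrypt() serves as its own
 * inverse here because the cipher (ChaCha20) is a stream cipher, so the
 * same call encrypts at write time and decrypts at read time.
 */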

static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
{
        unsigned sectors = vstruct_sectors(jset, c->block_bits);

        return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
                jset_validate_entries(c, jset, WRITE);
}

struct journal_read_buf {
        void            *data;
        size_t          size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
                                    size_t new_size)
{
        void *n;

        /* the bios are sized for this many pages, max: */
        if (new_size > JOURNAL_ENTRY_SIZE_MAX)
                return -ENOMEM;

        new_size = roundup_pow_of_two(new_size);
        n = kvpmalloc(new_size, GFP_KERNEL);
        if (!n)
                return -ENOMEM;

        kvpfree(b->data, b->size);
        b->data = n;
        b->size = new_size;
        return 0;
}
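
/*
 * Sizing note, with an example: rounding up to a power of two keeps
 * repeated grows to O(log n) allocations - an initial PAGE_SIZE buffer
 * that hits a ~300k journal entry reallocs straight to 512k, and every
 * entry up to that size then fits without another allocation. No data
 * is copied on grow; the caller simply rereads from disk into the new
 * buffer.
 */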

static int journal_read_bucket(struct bch_dev *ca,
                               struct journal_read_buf *buf,
                               struct journal_list *jlist,
                               unsigned bucket)
{
        struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
        struct jset *j = NULL;
        unsigned sectors, sectors_read = 0;
        u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
            end = offset + ca->mi.bucket_size;
        bool saw_bad = false;
        int ret = 0;

        pr_debug("reading %u", bucket);

        while (offset < end) {
                if (!sectors_read) {
                        struct bio *bio;
                        unsigned nr_bvecs;
reread:
                        sectors_read = min_t(unsigned,
                                end - offset, buf->size >> 9);
                        nr_bvecs = buf_pages(buf->data, sectors_read << 9);

                        bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
                        bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

                        bio->bi_iter.bi_sector = offset;
                        bch2_bio_map(bio, buf->data, sectors_read << 9);

                        ret = submit_bio_wait(bio);
                        kfree(bio);

                        if (bch2_dev_io_err_on(ret, ca,
                                               "journal read error: sector %llu",
                                               offset) ||
                            bch2_meta_read_fault("journal")) {
                                /*
                                 * We don't error out of the recovery process
                                 * here, since the relevant journal entry may
                                 * still be found on a different device, and
                                 * missing or absent journal entries are
                                 * handled later.
                                 */
                                return 0;
                        }

                        j = buf->data;
                }

                ret = jset_validate(c, ca, j, offset,
                                    end - offset, sectors_read,
                                    READ);
                switch (ret) {
                case 0:
                        sectors = vstruct_sectors(j, c->block_bits);
                        break;
                case JOURNAL_ENTRY_REREAD:
                        if (vstruct_bytes(j) > buf->size) {
                                ret = journal_read_buf_realloc(buf,
                                                        vstruct_bytes(j));
                                if (ret)
                                        return ret;
                        }
                        goto reread;
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
                        sectors = block_sectors(c);
                        goto next_block;
                case JOURNAL_ENTRY_BAD:
                        saw_bad = true;
                        /*
                         * On checksum error we don't really trust the size
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
                        sectors = block_sectors(c);
                        break;
                default:
                        return ret;
                }

                /*
                 * This happens sometimes if we don't have discards on -
                 * when we've partially overwritten a bucket with new
                 * journal entries. We don't need the rest of the
                 * bucket:
                 */
                if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
                        return 0;

                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

                mutex_lock(&jlist->lock);
                ret = journal_entry_add(c, ca, (struct journal_ptr) {
                                        .dev            = ca->dev_idx,
                                        .bucket         = bucket,
                                        .bucket_offset  = offset -
                                                bucket_to_sector(ca, ja->buckets[bucket]),
                                        .sector         = offset,
                                        }, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);

                switch (ret) {
                case JOURNAL_ENTRY_ADD_OK:
                        break;
                case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
                        break;
                default:
                        return ret;
                }
next_block:
                pr_debug("next");
                offset          += sectors;
                sectors_read    -= sectors;
                j = ((void *) j) + (sectors << 9);
        }

        return 0;
}
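
/*
 * Shape of the scan above, in brief: each journal bucket is read front
 * to back, validating one jset at a time. A short read triggers a
 * reread with a larger buffer; a bad checksum (JOURNAL_ENTRY_BAD) sets
 * saw_bad and resumes at the next block boundary, since the entry's
 * size field can't be trusted; a non-entry (JOURNAL_ENTRY_NONE) ends
 * the bucket, unless we saw a bad entry earlier, in which case we keep
 * probing block by block for entries past the damage.
 */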

static void bch2_journal_read_device(struct closure *cl)
{
        struct journal_device *ja =
                container_of(cl, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
        struct journal_replay *r, **_r;
        struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
        int ret = 0;

        if (!ja->nr)
                goto out;

        ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
        if (ret)
                goto err;

        pr_debug("%u journal buckets", ja->nr);

        for (i = 0; i < ja->nr; i++) {
                ret = journal_read_bucket(ca, &buf, jlist, i);
                if (ret)
                        goto err;
        }

        /* Find the journal bucket with the highest sequence number: */
        for (i = 0; i < ja->nr; i++) {
                if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
                        ja->cur_idx = i;

                min_seq = min(ja->bucket_seq[i], min_seq);
        }

        /*
         * If there are duplicate journal entries in multiple buckets (which
         * definitely isn't supposed to happen, but...) - make sure to start
         * cur_idx at the last of those buckets, so we don't deadlock trying to
         * allocate
         */
        while (ja->bucket_seq[ja->cur_idx] > min_seq &&
               ja->bucket_seq[ja->cur_idx] ==
               ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
                ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

        ja->sectors_free = ca->mi.bucket_size;

        mutex_lock(&jlist->lock);
        genradix_for_each(&c->journal_entries, iter, _r) {
                r = *_r;

                if (!r)
                        continue;

                for (i = 0; i < r->nr_ptrs; i++) {
                        if (r->ptrs[i].dev == ca->dev_idx &&
                            sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
                                unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
                                        vstruct_sectors(&r->j, c->block_bits);

                                ja->sectors_free = min(ja->sectors_free,
                                                       ca->mi.bucket_size - wrote);
                        }
                }
        }
        mutex_unlock(&jlist->lock);

        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
                        unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
                ja->sectors_free = 0;
        }

        /*
         * Set dirty_idx to indicate the entire journal is full and needs to be
         * reclaimed - journal reclaim will immediately reclaim whatever isn't
         * pinned when it first runs:
         */
        ja->discard_idx = ja->dirty_idx_ondisk =
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
        bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
        return;
err:
        mutex_lock(&jlist->lock);
        jlist->ret = ret;
        mutex_unlock(&jlist->lock);
        goto out;
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               struct journal_replay *j)
{
        unsigned i;

        for (i = 0; i < j->nr_ptrs; i++) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
                u64 offset;

                div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

                if (i)
                        prt_printf(out, " ");
                prt_printf(out, "%u:%u:%u (sector %llu)",
                       j->ptrs[i].dev,
                       j->ptrs[i].bucket,
                       j->ptrs[i].bucket_offset,
                       j->ptrs[i].sector);
        }
}

int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
        struct journal_list jlist;
        struct journal_replay *i, **_i, *prev = NULL;
        struct genradix_iter radix_iter;
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
        size_t keys = 0, entries = 0;
        bool degraded = false;
        u64 seq, last_seq = 0;
        int ret = 0;

        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
        jlist.last_seq = 0;
        jlist.ret = 0;

        for_each_member_device(ca, c, iter) {
                if (!c->opts.fsck &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;

                if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
                     ca->mi.state == BCH_MEMBER_STATE_ro) &&
                    percpu_ref_tryget(&ca->io_ref))
                        closure_call(&ca->journal.read,
                                     bch2_journal_read_device,
                                     system_unbound_wq,
                                     &jlist.cl);
                else
                        degraded = true;
        }

        closure_sync(&jlist.cl);

        if (jlist.ret)
                return jlist.ret;

        *start_seq = 0;

        /*
         * Find the most recent flush entry, and ignore newer non-flush
         * entries - those entries will be blacklisted:
         */
        genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                if (!*start_seq)
                        *start_seq = le64_to_cpu(i->j.seq) + 1;

                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                        break;
                }

                journal_replay_free(c, i);
        }

        if (!*start_seq) {
                bch_info(c, "journal read done, but no entries found");
                return 0;
        }

        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
                ret = -1;
                goto err;
        }

        /* Drop blacklisted entries and entries older than last_seq: */
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                seq = le64_to_cpu(i->j.seq);
                if (seq < last_seq) {
                        journal_replay_free(c, i);
                        continue;
                }

                if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
                        fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
                                    "found blacklisted journal entry %llu", seq);

                        journal_replay_free(c, i);
                }
        }

        /* Check for missing entries: */
        seq = last_seq;
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                BUG_ON(seq > le64_to_cpu(i->j.seq));

                while (seq < le64_to_cpu(i->j.seq)) {
                        u64 missing_start, missing_end;
                        struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (seq == le64_to_cpu(i->j.seq))
                                break;

                        missing_start = seq;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (prev) {
                                bch2_journal_ptrs_to_text(&buf1, c, prev);
                                prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
                        } else
                                prt_printf(&buf1, "(none)");
                        bch2_journal_ptrs_to_text(&buf2, c, i);

                        missing_end = seq - 1;
                        fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
                                 "  prev at %s\n"
                                 "  next at %s",
                                 missing_start, missing_end,
                                 last_seq, *blacklist_seq - 1,
                                 buf1.buf, buf2.buf);

                        printbuf_exit(&buf1);
                        printbuf_exit(&buf2);
                }

                prev = i;
                seq++;
        }

        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
                        .e.data_type = BCH_DATA_journal,
                        .e.nr_required = 1,
                };
                unsigned ptr;

                i = *_i;
                if (!i || i->ignore)
                        continue;

                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto err;

                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

                bch2_replicas_entry_sort(&replicas.e);

                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
                 */

                printbuf_reset(&buf);
                bch2_replicas_entry_to_text(&buf, &replicas.e);

                if (!degraded &&
                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
                                "superblock not marked as containing replicas %s",
                                buf.buf)) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
                }

                for_each_jset_key(k, _n, entry, &i->j)
                        keys++;
                entries++;
        }

        bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
                 keys, entries, *start_seq);

        if (*start_seq != *blacklist_seq)
                bch_info(c, "dropped unflushed entries %llu-%llu",
                         *blacklist_seq, *start_seq - 1);
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

/* journal write: */

static void __journal_write_alloc(struct journal *j,
                                  struct journal_buf *w,
                                  struct dev_alloc_list *devs_sorted,
                                  unsigned sectors,
                                  unsigned *replicas,
                                  unsigned replicas_want)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_device *ja;
        struct bch_dev *ca;
        unsigned i;

        if (*replicas >= replicas_want)
                return;

        for (i = 0; i < devs_sorted->nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                /*
                 * Check that we can use this device, and aren't already using
                 * it:
                 */
                if (!ca->mi.durability ||
                    ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
                    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
                                         ca->dev_idx) ||
                    sectors > ja->sectors_free)
                        continue;

                bch2_dev_stripe_increment(ca, &j->wp.stripe);

                bch2_bkey_append_ptr(&w->key,
                        (struct bch_extent_ptr) {
                                  .offset = bucket_to_sector(ca,
                                        ja->buckets[ja->cur_idx]) +
                                        ca->mi.bucket_size -
                                        ja->sectors_free,
                                  .dev = ca->dev_idx,
                });

                ja->sectors_free -= sectors;
                ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

                *replicas += ca->mi.durability;

                if (*replicas >= replicas_want)
                        break;
        }
}
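
/*
 * Durability accounting sketch for the loop above: devices with
 * durability 0 are skipped outright, each appended pointer contributes
 * ca->mi.durability replicas, and the walk stops as soon as *replicas
 * reaches replicas_want - so a journal write may land on fewer devices
 * than devs_sorted offers.
 */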

/**
 * journal_write_alloc - allocate devices and space for a journal write,
 * moving on to the next journal bucket if possible
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
        unsigned target = c->opts.metadata_target ?:
                c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);

        rcu_read_lock();
retry:
        devs = target_rw_devs(c, BCH_DATA_journal, target);

        devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas >= replicas_want)
                goto done;

        for (i = 0; i < devs_sorted.nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                if (sectors > ja->sectors_free &&
                    sectors <= ca->mi.bucket_size &&
                    bch2_journal_dev_buckets_available(j, ja,
                                        journal_space_discarded)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;

                        /*
                         * ja->bucket_seq[ja->cur_idx] must always have
                         * something sensible:
                         */
                        ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
                }
        }

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas < replicas_want && target) {
                /* Retry from all devices: */
                target = 0;
                goto retry;
        }
done:
        rcu_read_unlock();

        BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
1399
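/*
 * Grow the journal buffer to j->buf_size_want if it wants to be bigger:
 * the allocation is done without holding j->lock, and failure is fine -
 * we just carry on with the existing, smaller buffer.
 */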
1400 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1401 {
1402         /* we aren't holding j->lock: */
1403         unsigned new_size = READ_ONCE(j->buf_size_want);
1404         void *new_buf;
1405
1406         if (buf->buf_size >= new_size)
1407                 return;
1408
1409         new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
1410         if (!new_buf)
1411                 return;
1412
1413         memcpy(new_buf, buf->data, buf->buf_size);
1414
1415         spin_lock(&j->lock);
1416         swap(buf->data,         new_buf);
1417         swap(buf->buf_size,     new_size);
1418         spin_unlock(&j->lock);
1419
1420         kvpfree(new_buf, new_size);
1421 }
1422
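/* The buffer for the oldest journal entry not yet fully written out: */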
1423 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1424 {
1425         return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1426 }
1427
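/*
 * journal_write_done - final completion for a journal write, once all
 * the per-device bios have finished: record which replicas the entry
 * made it to, update the on-disk sequence numbers, advance
 * unwritten_idx, and kick off the next write if one is already closed
 * and waiting.
 */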
1428 static void journal_write_done(struct closure *cl)
1429 {
1430         struct journal *j = container_of(cl, struct journal, io);
1431         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1432         struct journal_buf *w = journal_last_unwritten_buf(j);
1433         struct bch_replicas_padded replicas;
1434         union journal_res_state old, new;
1435         u64 v, seq;
1436         int err = 0;
1437
1438         bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
1439                                ? j->flush_write_time
1440                                : j->noflush_write_time, j->write_start_time);
1441
1442         if (!w->devs_written.nr) {
1443                 bch_err(c, "unable to write journal to sufficient devices");
1444                 err = -EIO;
1445         } else {
1446                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1447                                          w->devs_written);
1448                 if (bch2_mark_replicas(c, &replicas.e))
1449                         err = -EIO;
1450         }
1451
1452         if (err)
1453                 bch2_fatal_error(c);
1454
1455         spin_lock(&j->lock);
1456         seq = le64_to_cpu(w->data->seq);
1457
1458         if (seq >= j->pin.front)
1459                 journal_seq_pin(j, seq)->devs = w->devs_written;
1460
1461         if (!err) {
1462                 if (!JSET_NO_FLUSH(w->data)) {
1463                         j->flushed_seq_ondisk = seq;
1464                         j->last_seq_ondisk = w->last_seq;
1465
1466                         bch2_do_discards(c);
1467                         closure_wake_up(&c->freelist_wait);
1468                 }
1469         } else if (!j->err_seq || seq < j->err_seq)
1470                 j->err_seq      = seq;
1471
1472         j->seq_ondisk           = seq;
1473
1474         /*
1475          * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1476          * more buckets:
1477          *
1478          * Must come before signalling write completion, for
1479          * bch2_fs_journal_stop():
1480          */
1481         if (j->watermark)
1482                 journal_reclaim_kick(&c->journal);
1483
1484         /* also must come before signalling write completion: */
1485         closure_debug_destroy(cl);
1486
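        /*
         * Lockless bump of unwritten_idx, retrying the cmpxchg until no
         * one else has modified j->reservations in the meantime; the
         * buffer we're retiring must have no outstanding references:
         */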
1487         v = atomic64_read(&j->reservations.counter);
1488         do {
1489                 old.v = new.v = v;
1490                 BUG_ON(journal_state_count(new, new.unwritten_idx));
1491
1492                 new.unwritten_idx++;
1493         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
1494                                        old.v, new.v)) != old.v);
1495
1496         bch2_journal_space_available(j);
1497
1498         closure_wake_up(&w->wait);
1499         journal_wake(j);
1500
1501         if (!journal_state_count(new, new.unwritten_idx) &&
1502             journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
1503                 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
1504         } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1505                    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1506                 struct journal_buf *buf = journal_cur_buf(j);
1507                 long delta = buf->expires - jiffies;
1508
1509                 /*
1510                  * We don't close a journal entry to write it while there
1511                  * are previous entries still in flight - the current
1512                  * journal entry might want to be written now:
1513                  */
1514
1515                 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
1516         }
1517
1518         spin_unlock(&j->lock);
1519 }
1520
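/*
 * Per-device write completion: on an IO error (or an injected write
 * fault) drop this device from the set the entry was successfully
 * written to, so journal_write_done() can tell whether enough copies
 * made it to disk.
 */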
1521 static void journal_write_endio(struct bio *bio)
1522 {
1523         struct bch_dev *ca = bio->bi_private;
1524         struct journal *j = &ca->fs->journal;
1525         struct journal_buf *w = journal_last_unwritten_buf(j);
1526         unsigned long flags;
1527
1528         if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
1529                                le64_to_cpu(w->data->seq),
1530                                bch2_blk_status_to_str(bio->bi_status)) ||
1531             bch2_meta_write_fault("journal")) {
1532                 spin_lock_irqsave(&j->err_lock, flags);
1533                 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1534                 spin_unlock_irqrestore(&j->err_lock, flags);
1535         }
1536
1537         closure_put(&j->io);
1538         percpu_ref_put(&ca->io_ref);
1539 }
1540
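/*
 * Submit the actual journal write: one bio per pointer in w->key, each
 * holding a ref on its device's io_ref and accounted against that
 * device's journal sector counts.
 */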
1541 static void do_journal_write(struct closure *cl)
1542 {
1543         struct journal *j = container_of(cl, struct journal, io);
1544         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1545         struct bch_dev *ca;
1546         struct journal_buf *w = journal_last_unwritten_buf(j);
1547         struct bch_extent_ptr *ptr;
1548         struct bio *bio;
1549         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1550
1551         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1552                 ca = bch_dev_bkey_exists(c, ptr->dev);
1553                 if (!percpu_ref_tryget(&ca->io_ref)) {
1554                         /* XXX: fix this */
1555                         bch_err(c, "missing device for journal write");
1556                         continue;
1557                 }
1558
1559                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1560                              sectors);
1561
1562                 bio = ca->journal.bio;
1563                 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1564                 bio->bi_iter.bi_sector  = ptr->offset;
1565                 bio->bi_end_io          = journal_write_endio;
1566                 bio->bi_private         = ca;
1567
1568                 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1569                 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1570
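                /*
                 * Flush writes need REQ_FUA; the preflush goes on this
                 * bio unless it was already issued as separate flushes
                 * to each device (see bch2_journal_write()):
                 */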
1571                 if (!JSET_NO_FLUSH(w->data))
1572                         bio->bi_opf    |= REQ_FUA;
1573                 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1574                         bio->bi_opf    |= REQ_PREFLUSH;
1575
1576                 bch2_bio_map(bio, w->data, sectors << 9);
1577
1578                 trace_and_count(c, journal_write, bio);
1579                 closure_bio_submit(bio, cl);
1580
1581                 ca->journal.bucket_seq[ca->journal.cur_idx] =
1582                         le64_to_cpu(w->data->seq);
1583         }
1584
1585         continue_at(cl, journal_write_done, c->io_complete_wq);
1586         return;
1587 }
1588
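/*
 * bch2_journal_write - prepare and issue a write of the oldest unwritten
 * journal buffer: decide whether it can skip the flush, propagate btree
 * roots and superblock entries into the entry, checksum and (possibly)
 * encrypt it, allocate space on enough devices, then hand off to
 * do_journal_write().
 */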
1589 void bch2_journal_write(struct closure *cl)
1590 {
1591         struct journal *j = container_of(cl, struct journal, io);
1592         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1593         struct bch_dev *ca;
1594         struct journal_buf *w = journal_last_unwritten_buf(j);
1595         struct jset_entry *start, *end;
1596         struct jset *jset;
1597         struct bio *bio;
1598         struct printbuf journal_debug_buf = PRINTBUF;
1599         bool validate_before_checksum = false;
1600         unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
1601         int ret;
1602
1603         BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
1604
1605         journal_buf_realloc(j, w);
1606         jset = w->data;
1607
1608         j->write_start_time = local_clock();
1609
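        /*
         * Decide whether this can be a noflush write: if the journal is
         * in an error state, or nothing requires a flush and we flushed
         * recently enough, skip the flush/FUA and write the entry with
         * last_seq = 0:
         */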
1610         spin_lock(&j->lock);
1611         if (bch2_journal_error(j) ||
1612             w->noflush ||
1613             (!w->must_flush &&
1614              (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
1615              test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
1616                 w->noflush = true;
1617                 SET_JSET_NO_FLUSH(jset, true);
1618                 jset->last_seq  = 0;
1619                 w->last_seq     = 0;
1620
1621                 j->nr_noflush_writes++;
1622         } else {
1623                 j->last_flush_write = jiffies;
1624                 j->nr_flush_writes++;
1625         }
1626         spin_unlock(&j->lock);
1627
1628         /*
1629          * New btree roots are set by journalling them; when the journal entry
1630          * gets written we have to propagate them to c->btree_roots
1631          *
1632          * But, every journal entry we write has to contain all the btree roots
1633          * (at least for now); so after we copy btree roots to c->btree_roots we
1634          * have to get any missing btree roots and add them to this journal
1635          * entry:
1636          */
1637
1638         bch2_journal_entries_to_btree_roots(c, jset);
1639
1640         start = end = vstruct_last(jset);
1641
1642         end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
1643
1644         bch2_journal_super_entries_add_common(c, &end,
1645                                 le64_to_cpu(jset->seq));
1646         u64s    = (u64 *) end - (u64 *) start;
1647         BUG_ON(u64s > j->entry_u64s_reserved);
1648
1649         le32_add_cpu(&jset->u64s, u64s);
1650         BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
1651
1652         jset->magic             = cpu_to_le64(jset_magic(c));
1653         jset->version           = c->sb.version < bcachefs_metadata_version_bkey_renumber
1654                 ? cpu_to_le32(BCH_JSET_VERSION_OLD)
1655                 : cpu_to_le32(c->sb.version);
1656
1657         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1658         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1659
1660         if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
1661                 j->last_empty_seq = le64_to_cpu(jset->seq);
1662
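        /*
         * Validate before checksumming when the checksum type implies
         * encryption - the encrypted payload can't be validated - and
         * when writing in an older on-disk version:
         */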
1663         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1664                 validate_before_checksum = true;
1665
1666         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1667                 validate_before_checksum = true;
1668
1669         if (validate_before_checksum &&
1670             jset_validate_for_write(c, jset))
1671                 goto err;
1672
1673         ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1674                     jset->encrypted_start,
1675                     vstruct_end(jset) - (void *) jset->encrypted_start);
1676         if (bch2_fs_fatal_err_on(ret, c,
1677                         "error encrypting journal entry: %i", ret))
1678                 goto err;
1679
1680         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1681                                   journal_nonce(jset), jset);
1682
1683         if (!validate_before_checksum &&
1684             jset_validate_for_write(c, jset))
1685                 goto err;
1686
1687         sectors = vstruct_sectors(jset, c->block_bits);
1688         BUG_ON(sectors > w->sectors);
1689
1690         bytes = vstruct_bytes(jset);
1691         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1692
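/*
 * Allocate devices and buckets for the write; if that fails and there
 * are buckets awaiting discards, do the discards and retry:
 */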
1693 retry_alloc:
1694         spin_lock(&j->lock);
1695         ret = journal_write_alloc(j, w, sectors);
1696
1697         if (ret && j->can_discard) {
1698                 spin_unlock(&j->lock);
1699                 bch2_journal_do_discards(j);
1700                 goto retry_alloc;
1701         }
1702
1703         if (ret)
1704                 __bch2_journal_debug_to_text(&journal_debug_buf, j);
1705
1706         /*
1707          * write is allocated, no longer need to account for it in
1708          * bch2_journal_space_available():
1709          */
1710         w->sectors = 0;
1711
1712         /*
1713          * journal entry has been compacted and allocated, recalculate space
1714          * available:
1715          */
1716         bch2_journal_space_available(j);
1717         spin_unlock(&j->lock);
1718
1719         if (ret) {
1720                 bch_err(c, "Unable to allocate journal write:\n%s",
1721                         journal_debug_buf.buf);
1722                 printbuf_exit(&journal_debug_buf);
1723                 bch2_fatal_error(c);
1724                 continue_at(cl, journal_write_done, c->io_complete_wq);
1725                 return;
1726         }
1727
1728         w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
1729
1730         if (c->opts.nochanges)
1731                 goto no_io;
1732
1733         for_each_rw_member(ca, c, i)
1734                 nr_rw_members++;
1735
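        /*
         * With more than one rw device, issue the flush up front as a
         * separate REQ_OP_FLUSH bio to each device, instead of
         * REQ_PREFLUSH on the journal write itself:
         */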
1736         if (nr_rw_members > 1)
1737                 w->separate_flush = true;
1738
1739         if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
1740                 for_each_rw_member(ca, c, i) {
1741                         percpu_ref_get(&ca->io_ref);
1742
1743                         bio = ca->journal.bio;
1744                         bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
1745                         bio->bi_end_io          = journal_write_endio;
1746                         bio->bi_private         = ca;
1747                         closure_bio_submit(bio, cl);
1748                 }
1749         }
1750
1751         continue_at(cl, do_journal_write, c->io_complete_wq);
1752         return;
1753 no_io:
1754         continue_at(cl, journal_write_done, c->io_complete_wq);
1755         return;
1756 err:
1757         bch2_fatal_error(c);
1758         continue_at(cl, journal_write_done, c->io_complete_wq);
1759 }