// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"

#include <trace/events/bcachefs.h>

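/*
 * Journal entries read from disk are kept on a list of struct journal_replay,
 * sorted by sequence number. When read_entire_journal is set, entries we'd
 * normally drop are only marked as ignored instead of freed, so tools can
 * still inspect the entire journal.
 */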
static void __journal_replay_free(struct journal_replay *i)
{
        list_del(&i->list);
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
        i->ignore = true;

        if (!c->opts.read_entire_journal)
                __journal_replay_free(i);
}

struct journal_list {
        struct closure          cl;
        struct mutex            lock;
        struct list_head        *head;
        int                     ret;
};

#define JOURNAL_ENTRY_ADD_OK            0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE  5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_ptr entry_ptr,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
{
        struct journal_replay *i, *pos, *dup = NULL;
        struct journal_ptr *ptr;
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = 0;
        int ret = JOURNAL_ENTRY_ADD_OK;

        list_for_each_entry_reverse(i, jlist->head, list) {
                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq = le64_to_cpu(i->j.last_seq);
                        break;
                }
        }

        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
            le64_to_cpu(j->seq) < last_seq) {
                ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
                goto out;
        }

        /* Drop entries we don't need anymore */
        if (!JSET_NO_FLUSH(j)) {
                list_for_each_entry_safe(i, pos, jlist->head, list) {
                        if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
                                break;
                        journal_replay_free(c, i);
                }
        }

        list_for_each_entry_reverse(i, jlist->head, list) {
                if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
                        where = &i->list;
                        goto add;
                }
        }

        where = jlist->head;
add:
        dup = where->next != jlist->head
                ? container_of(where->next, struct journal_replay, list)
                : NULL;

        if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
                dup = NULL;

        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
        if (dup) {
                if (dup->bad) {
                        /* we'll replace @dup: */
                } else if (bad) {
                        i = dup;
                        goto found;
                } else {
                        fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
                                    memcmp(j, &dup->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
                        i = dup;
                        goto found;
                }
        }

        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i) {
                ret = -ENOMEM;
                goto out;
        }

        i->nr_ptrs      = 0;
        i->bad          = bad;
        i->ignore       = false;
        memcpy(&i->j, j, bytes);

        if (dup) {
                i->nr_ptrs = dup->nr_ptrs;
                memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
                __journal_replay_free(dup);
        }

        list_add(&i->list, where);
found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
                        bch_err(c, "duplicate journal entry %llu on same device",
                                le64_to_cpu(i->j.seq));
                        goto out;
                }
        }

        if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
                bch_err(c, "found too many copies of journal entry %llu",
                        le64_to_cpu(i->j.seq));
                goto out;
        }

        i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
        return ret;
}

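/*
 * The nonce for journal checksums and encryption is derived from the entry's
 * sequence number, so every jset is checksummed/encrypted with a distinct
 * nonce:
 */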
static struct nonce journal_nonce(const struct jset *jset)
{
        return (struct nonce) {{
                [0] = 0,
                [1] = ((__le32 *) &jset->seq)[0],
                [2] = ((__le32 *) &jset->seq)[1],
                [3] = BCH_NONCE_JOURNAL,
        }};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
        struct jset_entry *entry;

        for (entry = start; entry != end; entry = vstruct_next(entry))
                memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD    5
#define JOURNAL_ENTRY_NONE      6
#define JOURNAL_ENTRY_BAD       7

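/*
 * On READ, a validation failure is a (possibly fixable) fsck error; on WRITE,
 * it means we're about to write out corrupt metadata, so we log it and - if
 * bch2_fs_inconsistent() says we can't continue - abort the write:
 */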
#define journal_entry_err(c, msg, ...)                                  \
({                                                                      \
        switch (write) {                                                \
        case READ:                                                      \
                mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
                break;                                                  \
        case WRITE:                                                     \
                bch_err(c, "corrupt metadata before write:\n"           \
                        msg, ##__VA_ARGS__);                            \
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        }                                                               \
        true;                                                           \
})

#define journal_entry_err_on(cond, c, msg, ...)                         \
        ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

#define FSCK_DELETED_KEY        5

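/*
 * Validate a single bkey within a journal entry: on failure the key is
 * deleted in place, by shrinking entry->u64s and shifting the remaining keys
 * down, and FSCK_DELETED_KEY tells the caller not to advance to the next key:
 */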
static int journal_validate_key(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
                                struct bkey_i *k, const char *type,
                                unsigned version, int big_endian, int write)
{
        void *next = vstruct_next(entry);
        const char *invalid;
        int ret = 0;

        if (journal_entry_err_on(!k->k.u64s, c,
                        "invalid %s in %s entry offset %zi/%u: k->u64s 0",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on((void *) bkey_next(k) >
                                (void *) vstruct_next(entry), c,
                        "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
                        "invalid %s in %s entry offset %zi/%u: bad format %u",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s),
                        k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (!write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));

        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                    __btree_node_type(level, btree_id));
        if (invalid) {
                struct printbuf buf = PRINTBUF;

                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
                mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
                                 type, where,
                                 (u64 *) k - entry->_data,
                                 le16_to_cpu(entry->u64s),
                                 invalid, buf.buf);
                printbuf_exit(&buf);

                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));
fsck_err:
        return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;

        while (k != vstruct_last(entry)) {
                int ret = journal_validate_key(c, where, entry,
                                               entry->level,
                                               entry->btree_id,
                                               k, "key", version, big_endian, write);
                if (ret == FSCK_DELETED_KEY)
                        continue;

                k = bkey_next(k);
        }

        return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct bkey_i *k;
        bool first = true;

        vstruct_for_each(entry, k) {
                if (!first) {
                        pr_newline(out);
                        pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                }
                pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
                bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
                first = false;
        }
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;
        int ret = 0;

        if (journal_entry_err_on(!entry->u64s ||
                                 le16_to_cpu(entry->u64s) != k->k.u64s, c,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
                 * we don't want to null out this jset_entry,
                 * just the contents, so that later we can tell
                 * we were _supposed_ to have a btree root
                 */
                entry->u64s = 0;
                journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }

        return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
                                    "btree root", version, big_endian, write);
fsck_err:
        return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        /* obsolete, don't care: */
        return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
fsck_err:
        return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_blacklist *bl =
                container_of(entry, struct jset_entry_blacklist, entry);

        pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
                                               const char *where,
                                               struct jset_entry *entry,
                                               unsigned version, int big_endian, int write)
{
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
        }

        bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
                                 le64_to_cpu(bl_entry->end), c,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
out:
fsck_err:
        return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
                                               struct jset_entry *entry)
{
        struct jset_entry_blacklist_v2 *bl =
                container_of(entry, struct jset_entry_blacklist_v2, entry);

        pr_buf(out, "start=%llu end=%llu",
               le64_to_cpu(bl->start),
               le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u),
                                 c,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);

        pr_buf(out, "type=%s v=%llu",
               bch2_fs_usage_types[u->entry.btree_id],
               le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
                                 c,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);

        bch2_replicas_entry_to_text(out, &u->r);
        pr_buf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes != sizeof(*clock),
                                 c, "invalid journal entry clock: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(clock->rw > 1,
                                 c, "invalid journal entry clock: bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);

        pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        unsigned expected = sizeof(*u);
        unsigned dev;
        int ret = 0;

        if (journal_entry_err_on(bytes < expected,
                                 c, "invalid journal entry dev usage: bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        dev = le32_to_cpu(u->dev);

        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
                                 c, "invalid journal entry dev usage: bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(u->pad,
                                 c, "invalid journal entry dev usage: bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

        pr_buf(out, "dev=%u", le32_to_cpu(u->dev));

        for (i = 0; i < nr_types; i++) {
                if (i < BCH_DATA_NR)
                        pr_buf(out, " %s", bch2_data_types[i]);
                else
                        pr_buf(out, " (unknown data type %u)", i);
                pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
                       le64_to_cpu(u->d[i].buckets),
                       le64_to_cpu(u->d[i].sectors),
                       le64_to_cpu(u->d[i].fragmented));
        }

        pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
               le64_to_cpu(u->buckets_ec),
               le64_to_cpu(u->buckets_unavailable));
}

static int journal_entry_log_validate(struct bch_fs *c,
                                      const char *where,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
{
        return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
                                      struct jset_entry *entry)
{
        struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
        unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

        pr_buf(out, "%.*s", (int) bytes, l->d);
}

struct jset_entry_ops {
        int (*validate)(struct bch_fs *, const char *,
                        struct jset_entry *, unsigned, int, int);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

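/*
 * Dispatch table for the validate/to_text handlers above, generated from
 * BCH_JSET_ENTRY_TYPES(): each x(f, nr) in that list expands to an entry
 * along the lines of (illustrative expansion, using the btree_keys handlers
 * defined above):
 *
 *      [BCH_JSET_ENTRY_btree_keys] = (struct jset_entry_ops) {
 *              .validate       = journal_entry_btree_keys_validate,
 *              .to_text        = journal_entry_btree_keys_to_text,
 *      },
 */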
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)                                                \
        [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
                .validate       = journal_entry_##f##_validate, \
                .to_text        = journal_entry_##f##_to_text,  \
        },
        BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned version, int big_endian, int write)
{
        return entry->type < BCH_JSET_ENTRY_NR
                ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
                                version, big_endian, write)
                : 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
                                struct jset_entry *entry)
{
        if (entry->type < BCH_JSET_ENTRY_NR) {
                pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
        } else {
                pr_buf(out, "(unknown type %u)", entry->type);
        }
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
{
        char buf[100];
        struct jset_entry *entry;
        int ret = 0;

        vstruct_for_each(jset, entry) {
                scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
                          le64_to_cpu(jset->seq),
                          (u64 *) entry - jset->_data,
                          le32_to_cpu(jset->u64s));

                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(jset), c,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }

                ret = bch2_journal_entry_validate(c, buf, entry,
                                        le32_to_cpu(jset->version),
                                        JSET_BIG_ENDIAN(jset), write);
                if (ret)
                        break;
        }
fsck_err:
        return ret;
}

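/*
 * Validate a jset header we just read from @sector: returns 0 if it passed,
 * one of the JOURNAL_ENTRY_* codes for expected/recoverable conditions (not a
 * journal entry at all, need a bigger read, bad checksum), or an error if we
 * shouldn't continue:
 */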
static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
                         unsigned bucket_sectors_left,
                         unsigned sectors_read,
                         int write)
{
        size_t bytes = vstruct_bytes(jset);
        struct bch_csum csum;
        unsigned version;
        int ret = 0;

        if (le64_to_cpu(jset->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;

        version = le32_to_cpu(jset->version);
        if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
                                  version < bcachefs_metadata_version_min) ||
                                 version >= bcachefs_metadata_version_max, c,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        version)) {
                /* don't try to continue: */
                return -EINVAL;
        }

        if (bytes > (sectors_read << 9) &&
            sectors_read < bucket_sectors_left)
                return JOURNAL_ENTRY_REREAD;

        if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq), bytes)) {
                ret = JOURNAL_ENTRY_BAD;
                le32_add_cpu(&jset->u64s,
                             -((bytes - (bucket_sectors_left << 9)) / 8));
        }

        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        JSET_CSUM_TYPE(jset))) {
                ret = JOURNAL_ENTRY_BAD;
                goto csum_done;
        }

        if (write)
                goto csum_done;

        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
                                 "%s sector %llu seq %llu: journal checksum bad",
                                 ca ? ca->name : c->name,
                                 sector, le64_to_cpu(jset->seq)))
                ret = JOURNAL_ENTRY_BAD;

        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                     jset->encrypted_start,
                     vstruct_end(jset) - (void *) jset->encrypted_start);
        bch2_fs_fatal_err_on(ret, c,
                        "error decrypting journal entry: %i", ret);
csum_done:
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
fsck_err:
        return ret;
}

static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
{
        unsigned sectors = vstruct_sectors(jset, c->block_bits);

        return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
                jset_validate_entries(c, jset, WRITE);
}

struct journal_read_buf {
        void            *data;
        size_t          size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
                                    size_t new_size)
{
        void *n;

        /* the bios are sized for this many pages, max: */
        if (new_size > JOURNAL_ENTRY_SIZE_MAX)
                return -ENOMEM;

        new_size = roundup_pow_of_two(new_size);
        n = kvpmalloc(new_size, GFP_KERNEL);
        if (!n)
                return -ENOMEM;

        kvpfree(b->data, b->size);
        b->data = n;
        b->size = new_size;
        return 0;
}

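/*
 * Read one journal bucket, walking the entries written back to back within
 * it. Entries that fail validation are still added to the replay list (with
 * bad set), so that a good copy of the same entry on another device can take
 * precedence in journal_entry_add():
 */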
static int journal_read_bucket(struct bch_dev *ca,
                               struct journal_read_buf *buf,
                               struct journal_list *jlist,
                               unsigned bucket)
{
        struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
        struct jset *j = NULL;
        unsigned sectors, sectors_read = 0;
        u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
            end = offset + ca->mi.bucket_size;
        bool saw_bad = false;
        int ret = 0;

        pr_debug("reading %u", bucket);

        while (offset < end) {
                if (!sectors_read) {
                        struct bio *bio;
reread:
                        sectors_read = min_t(unsigned,
                                end - offset, buf->size >> 9);

                        bio = bio_kmalloc(GFP_KERNEL,
                                          buf_pages(buf->data,
                                                    sectors_read << 9));
                        bio_set_dev(bio, ca->disk_sb.bdev);
                        bio->bi_iter.bi_sector  = offset;
                        bio_set_op_attrs(bio, REQ_OP_READ, 0);
                        bch2_bio_map(bio, buf->data, sectors_read << 9);

                        ret = submit_bio_wait(bio);
                        bio_put(bio);

                        if (bch2_dev_io_err_on(ret, ca,
                                               "journal read error: sector %llu",
                                               offset) ||
                            bch2_meta_read_fault("journal")) {
                                /*
                                 * We don't error out of the recovery process
                                 * here, since the relevant journal entry may be
                                 * found on a different device, and missing or
                                 * no journal entries will be handled later
                                 */
                                return 0;
                        }

                        j = buf->data;
                }

                ret = jset_validate(c, ca, j, offset,
                                    end - offset, sectors_read,
                                    READ);
                switch (ret) {
                case BCH_FSCK_OK:
                        sectors = vstruct_sectors(j, c->block_bits);
                        break;
                case JOURNAL_ENTRY_REREAD:
                        if (vstruct_bytes(j) > buf->size) {
                                ret = journal_read_buf_realloc(buf,
                                                        vstruct_bytes(j));
                                if (ret)
                                        return ret;
                        }
                        goto reread;
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
                        sectors = block_sectors(c);
                        goto next_block;
                case JOURNAL_ENTRY_BAD:
                        saw_bad = true;
                        /*
                         * On checksum error we don't really trust the size
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
                        sectors = block_sectors(c);
                        break;
                default:
                        return ret;
                }

                /*
                 * This happens sometimes if we don't have discards on -
                 * when we've partially overwritten a bucket with new
                 * journal entries. We don't need the rest of the
                 * bucket:
                 */
                if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
                        return 0;

                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

                mutex_lock(&jlist->lock);
                ret = journal_entry_add(c, ca, (struct journal_ptr) {
                                        .dev            = ca->dev_idx,
                                        .bucket         = bucket,
                                        .bucket_offset  = offset -
                                                bucket_to_sector(ca, ja->buckets[bucket]),
                                        .sector         = offset,
                                        }, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);

                switch (ret) {
                case JOURNAL_ENTRY_ADD_OK:
                        break;
                case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
                        break;
                default:
                        return ret;
                }
next_block:
                pr_debug("next");
                offset          += sectors;
                sectors_read    -= sectors;
                j = ((void *) j) + (sectors << 9);
        }

        return 0;
}

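/*
 * Per-device journal read, run asynchronously as a closure from
 * bch2_journal_read(): after scanning every journal bucket, work out cur_idx
 * and sectors_free so that new journal writes resume after the newest entry
 * found on this device:
 */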
static void bch2_journal_read_device(struct closure *cl)
{
        struct journal_device *ja =
                container_of(cl, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
        struct journal_replay *r;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
        int ret = 0;

        if (!ja->nr)
                goto out;

        ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
        if (ret)
                goto err;

        pr_debug("%u journal buckets", ja->nr);

        for (i = 0; i < ja->nr; i++) {
                ret = journal_read_bucket(ca, &buf, jlist, i);
                if (ret)
                        goto err;
        }

        /* Find the journal bucket with the highest sequence number: */
        for (i = 0; i < ja->nr; i++) {
                if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
                        ja->cur_idx = i;

                min_seq = min(ja->bucket_seq[i], min_seq);
        }

        /*
         * If there are duplicate journal entries in multiple buckets (which
         * definitely isn't supposed to happen, but...) - make sure to start
         * cur_idx at the last of those buckets, so we don't deadlock trying to
         * allocate:
         */
        while (ja->bucket_seq[ja->cur_idx] > min_seq &&
               ja->bucket_seq[ja->cur_idx] ==
               ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
                ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

        ja->sectors_free = ca->mi.bucket_size;

        mutex_lock(&jlist->lock);
        list_for_each_entry(r, jlist->head, list) {
                for (i = 0; i < r->nr_ptrs; i++) {
                        if (r->ptrs[i].dev == ca->dev_idx &&
                            sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
                                unsigned wrote = (r->ptrs[i].sector % ca->mi.bucket_size) +
                                        vstruct_sectors(&r->j, c->block_bits);

                                ja->sectors_free = min(ja->sectors_free,
                                                       ca->mi.bucket_size - wrote);
                        }
                }
        }
        mutex_unlock(&jlist->lock);

        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
                        /* don't underflow when cur_idx == 0: */
                        unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;

                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
                ja->sectors_free = 0;
        }

        /*
         * Set dirty_idx to indicate the entire journal is full and needs to be
         * reclaimed - journal reclaim will immediately reclaim whatever isn't
         * pinned when it first runs:
         */
        ja->discard_idx = ja->dirty_idx_ondisk =
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
        bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
        return;
err:
        mutex_lock(&jlist->lock);
        jlist->ret = ret;
        mutex_unlock(&jlist->lock);
        goto out;
}

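/* Print a journal entry's replica pointers, as dev:bucket:bucket_offset (sector): */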
void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               struct journal_replay *j)
{
        unsigned i;

        for (i = 0; i < j->nr_ptrs; i++) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
                u64 offset;

                div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

                if (i)
                        pr_buf(out, " ");
                pr_buf(out, "%u:%u:%u (sector %llu)",
                       j->ptrs[i].dev,
                       j->ptrs[i].bucket,
                       j->ptrs[i].bucket_offset,
                       j->ptrs[i].sector);
        }
}

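/*
 * Read the journal from all devices and build the list of entries to replay:
 * on return, *start_seq is one past the newest sequence number found and
 * *blacklist_seq one past the newest flush entry - unflushed entries in
 * between will be blacklisted:
 */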
int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                      u64 *blacklist_seq, u64 *start_seq)
{
        struct journal_list jlist;
        struct journal_replay *i, *t;
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
        size_t keys = 0, entries = 0;
        bool degraded = false;
        u64 seq, last_seq = 0;
        int ret = 0;

        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
        jlist.head = list;
        jlist.ret = 0;

        for_each_member_device(ca, c, iter) {
                if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;

                if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
                     ca->mi.state == BCH_MEMBER_STATE_ro) &&
                    percpu_ref_tryget(&ca->io_ref))
                        closure_call(&ca->journal.read,
                                     bch2_journal_read_device,
                                     system_unbound_wq,
                                     &jlist.cl);
                else
                        degraded = true;
        }

        closure_sync(&jlist.cl);

        if (jlist.ret)
                return jlist.ret;

        if (list_empty(list)) {
                bch_info(c, "journal read done, but no entries found");
                return 0;
        }

        i = list_last_entry(list, struct journal_replay, list);
        *start_seq = le64_to_cpu(i->j.seq) + 1;

        /*
         * Find the most recent flush entry, and ignore newer non-flush
         * entries - those entries will be blacklisted:
         */
        list_for_each_entry_safe_reverse(i, t, list, list) {
                if (i->ignore)
                        continue;

                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                        break;
                }

                journal_replay_free(c, i);
        }

        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
                ret = -1;
                goto err;
        }

        /* Drop blacklisted entries and entries older than last_seq: */
        list_for_each_entry_safe(i, t, list, list) {
                if (i->ignore)
                        continue;

                seq = le64_to_cpu(i->j.seq);
                if (seq < last_seq) {
                        journal_replay_free(c, i);
                        continue;
                }

                if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
                        fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
                                    "found blacklisted journal entry %llu", seq);

                        journal_replay_free(c, i);
                }
        }

        /* Check for missing entries: */
        seq = last_seq;
        list_for_each_entry(i, list, list) {
                if (i->ignore)
                        continue;

                BUG_ON(seq > le64_to_cpu(i->j.seq));

                while (seq < le64_to_cpu(i->j.seq)) {
                        u64 missing_start, missing_end;
                        struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (seq == le64_to_cpu(i->j.seq))
                                break;

                        missing_start = seq;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (i->list.prev != list) {
                                struct journal_replay *p = list_prev_entry(i, list);

                                bch2_journal_ptrs_to_text(&buf1, c, p);
                                pr_buf(&buf1, " size %zu", vstruct_sectors(&p->j, c->block_bits));
                        } else
                                pr_buf(&buf1, "(none)");
                        bch2_journal_ptrs_to_text(&buf2, c, i);

                        missing_end = seq - 1;
                        fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
                                 "  prev at %s\n"
                                 "  next at %s",
                                 missing_start, missing_end,
                                 last_seq, *blacklist_seq - 1,
                                 buf1.buf, buf2.buf);

                        printbuf_exit(&buf1);
                        printbuf_exit(&buf2);
                }

                seq++;
        }

        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
                        .e.data_type = BCH_DATA_journal,
                        .e.nr_required = 1,
                };
                unsigned ptr;

                if (i->ignore)
                        continue;

                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto err;

                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

                bch2_replicas_entry_sort(&replicas.e);

                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
                 */

                printbuf_reset(&buf);
                bch2_replicas_entry_to_text(&buf, &replicas.e);

                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
                                 "superblock not marked as containing replicas %s",
                                 buf.buf))) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
                }

                for_each_jset_key(k, _n, entry, &i->j)
                        keys++;
                entries++;
        }

        bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
                 keys, entries, *start_seq);

        if (*start_seq != *blacklist_seq)
                bch_info(c, "dropped unflushed entries %llu-%llu",
                         *blacklist_seq, *start_seq - 1);
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

/* journal write: */

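/*
 * Add pointers to @w->key until we have @replicas_want replicas, walking
 * devices in stripe order and skipping any we can't use - not rw, no journal
 * buckets, already holding a copy of this write, or without room for
 * @sectors:
 */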
static void __journal_write_alloc(struct journal *j,
                                  struct journal_buf *w,
                                  struct dev_alloc_list *devs_sorted,
                                  unsigned sectors,
                                  unsigned *replicas,
                                  unsigned replicas_want)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_device *ja;
        struct bch_dev *ca;
        unsigned i;

        if (*replicas >= replicas_want)
                return;

        for (i = 0; i < devs_sorted->nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                /*
                 * Check that we can use this device, and aren't already using
                 * it:
                 */
                if (!ca->mi.durability ||
                    ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
                    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
                                         ca->dev_idx) ||
                    sectors > ja->sectors_free)
                        continue;

                bch2_dev_stripe_increment(ca, &j->wp.stripe);

                bch2_bkey_append_ptr(&w->key,
                        (struct bch_extent_ptr) {
                                  .offset = bucket_to_sector(ca,
                                        ja->buckets[ja->cur_idx]) +
                                        ca->mi.bucket_size -
                                        ja->sectors_free,
                                  .dev = ca->dev_idx,
                });

                ja->sectors_free -= sectors;
                ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

                *replicas += ca->mi.durability;

                if (*replicas >= replicas_want)
                        break;
        }
}

/**
 * journal_write_alloc - allocate devices and sectors for a journal write,
 * moving on to the next journal bucket on a device when necessary
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
        unsigned target = c->opts.metadata_target ?:
                c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);

        rcu_read_lock();
retry:
        devs = target_rw_devs(c, BCH_DATA_journal, target);

        devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas >= replicas_want)
                goto done;

        for (i = 0; i < devs_sorted.nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                if (sectors > ja->sectors_free &&
                    sectors <= ca->mi.bucket_size &&
                    bch2_journal_dev_buckets_available(j, ja,
                                        journal_space_discarded)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;

                        /*
                         * ja->bucket_seq[ja->cur_idx] must always have
                         * something sensible:
                         */
                        ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
                }
        }

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas < replicas_want && target) {
                /* Retry from all devices: */
                target = 0;
                goto retry;
        }
done:
        rcu_read_unlock();

        BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

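/*
 * Grow the journal buffer towards buf_size_want if a larger reservation was
 * requested; the swap is done under j->lock so that readers always see a
 * consistent data/buf_size pair:
 */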
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
        /* we aren't holding j->lock: */
        unsigned new_size = READ_ONCE(j->buf_size_want);
        void *new_buf;

        if (buf->buf_size >= new_size)
                return;

        new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
        if (!new_buf)
                return;

        memcpy(new_buf, buf->data, buf->buf_size);

        spin_lock(&j->lock);
        swap(buf->data,         new_buf);
        swap(buf->buf_size,     new_size);
        spin_unlock(&j->lock);

        kvpfree(new_buf, new_size);
}

static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
        return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}

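/*
 * Called when every copy of a journal write has completed (or failed): update
 * the on-disk sequence numbers, mark which replicas the entry now exists on,
 * and kick off the next pending write if one is ready:
 */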
static void journal_write_done(struct closure *cl)
{
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_replicas_padded replicas;
        union journal_res_state old, new;
        u64 v, seq;
        int err = 0;

        bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
                               ? j->flush_write_time
                               : j->noflush_write_time, j->write_start_time);

        if (!w->devs_written.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
                err = -EIO;
        } else {
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
                                         w->devs_written);
                if (bch2_mark_replicas(c, &replicas.e))
                        err = -EIO;
        }

        if (err)
                bch2_fatal_error(c);

        spin_lock(&j->lock);
        seq = le64_to_cpu(w->data->seq);

        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = w->devs_written;

        if (!err) {
                if (!JSET_NO_FLUSH(w->data)) {
                        j->flushed_seq_ondisk = seq;
                        j->last_seq_ondisk = w->last_seq;

                        bch2_do_discards(c);
                        closure_wake_up(&c->freelist_wait);
                }
        } else if (!j->err_seq || seq < j->err_seq)
                j->err_seq      = seq;

        j->seq_ondisk           = seq;

        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
         * more buckets:
         *
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
        journal_reclaim_kick(&c->journal);

        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);

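        /*
         * Lockless state update: advance unwritten_idx to retire this buffer.
         * The BUG_ON verifies nothing still holds a reservation on the slot
         * we're retiring:
         */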
        v = atomic64_read(&j->reservations.counter);
        do {
                old.v = new.v = v;
                BUG_ON(journal_state_count(new, new.unwritten_idx));

                new.unwritten_idx++;
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);

        bch2_journal_space_available(j);

        closure_wake_up(&w->wait);
        journal_wake(j);

        if (!journal_state_count(new, new.unwritten_idx) &&
            journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
                closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
        } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
                   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
                struct journal_buf *buf = journal_cur_buf(j);
                long delta = buf->expires - jiffies;

                /*
                 * We don't close a journal entry to write it while there are
                 * previous entries still in flight - the current journal entry
                 * might want to be written now:
                 */

                mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
        }

        spin_unlock(&j->lock);
}

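/*
 * Per-device bio completion: on an IO error (or an injected write fault),
 * drop this device from the set of devices the entry was written to, so that
 * journal_write_done() sees an accurate replica list. err_lock is taken
 * irqsave because this runs in interrupt context.
 */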
static void journal_write_endio(struct bio *bio)
{
        struct bch_dev *ca = bio->bi_private;
        struct journal *j = &ca->fs->journal;
        struct journal_buf *w = journal_last_unwritten_buf(j);
        unsigned long flags;

        if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
                               le64_to_cpu(w->data->seq),
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
                spin_lock_irqsave(&j->err_lock, flags);
                bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
                spin_unlock_irqrestore(&j->err_lock, flags);
        }

        closure_put(&j->io);
        percpu_ref_put(&ca->io_ref);
}

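/*
 * Submit one bio per pointer in the journal key - i.e. one write per replica.
 * Each bio holds a ref on the device and on the closure; completions run
 * through journal_write_endio() before journal_write_done() is called.
 */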
static void do_journal_write(struct closure *cl)
{
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_extent_ptr *ptr;
        struct bio *bio;
        unsigned sectors = vstruct_sectors(w->data, c->block_bits);

        extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
                ca = bch_dev_bkey_exists(c, ptr->dev);
                if (!percpu_ref_tryget(&ca->io_ref)) {
                        /* XXX: fix this */
                        bch_err(c, "missing device for journal write");
                        continue;
                }

                this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
                             sectors);

                bio = ca->journal.bio;
                bio_reset(bio);
                bio_set_dev(bio, ca->disk_sb.bdev);
                bio->bi_iter.bi_sector  = ptr->offset;
                bio->bi_end_io          = journal_write_endio;
                bio->bi_private         = ca;
                bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;

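                /*
                 * Sanity check: consecutive journal writes to a device should
                 * never target the same sector; if they do, journal bucket
                 * allocation has gone wrong:
                 */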
                BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
                ca->prev_journal_sector = bio->bi_iter.bi_sector;

                if (!JSET_NO_FLUSH(w->data))
                        bio->bi_opf    |= REQ_FUA;
                if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
                        bio->bi_opf    |= REQ_PREFLUSH;

                bch2_bio_map(bio, w->data, sectors << 9);

                trace_journal_write(bio);
                closure_bio_submit(bio, cl);

                ca->journal.bucket_seq[ca->journal.cur_idx] =
                        le64_to_cpu(w->data->seq);
        }

        continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
}

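/*
 * Main journal write path, run out of the journal closure: decide whether
 * this is a flush or noflush write, append btree roots and superblock
 * entries, checksum/encrypt the entry, allocate space on disk, then hand off
 * to do_journal_write() to submit the actual IO.
 */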
void bch2_journal_write(struct closure *cl)
{
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_buf *w = journal_last_unwritten_buf(j);
        struct jset_entry *start, *end;
        struct jset *jset;
        struct bio *bio;
        struct printbuf journal_debug_buf = PRINTBUF;
        bool validate_before_checksum = false;
        unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
        int ret;

        BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

        journal_buf_realloc(j, w);
        jset = w->data;

        j->write_start_time = local_clock();

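        /*
         * Downgrade to a noflush write if the journal is in an error state,
         * if this entry was already marked noflush, or if no flush is
         * required, the last flush was within journal_flush_delay, and
         * skipping flushes is allowed (JOURNAL_MAY_SKIP_FLUSH). Noflush
         * entries don't advance last_seq, so it's written as 0:
         */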
        spin_lock(&j->lock);
        if (bch2_journal_error(j) ||
            w->noflush ||
            (!w->must_flush &&
             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
                w->noflush = true;
                SET_JSET_NO_FLUSH(jset, true);
                jset->last_seq  = 0;
                w->last_seq     = 0;

                j->nr_noflush_writes++;
        } else {
                j->last_flush_write = jiffies;
                j->nr_flush_writes++;
        }
        spin_unlock(&j->lock);

        /*
         * New btree roots are set by journalling them; when the journal entry
         * gets written we have to propagate them to c->btree_roots
         *
         * But, every journal entry we write has to contain all the btree roots
         * (at least for now); so after we copy btree roots to c->btree_roots we
         * have to get any missing btree roots and add them to this journal
         * entry:
         */

        bch2_journal_entries_to_btree_roots(c, jset);

        start = end = vstruct_last(jset);

        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);

        bch2_journal_super_entries_add_common(c, &end,
                                le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);

        le32_add_cpu(&jset->u64s, u64s);
        BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);

        jset->magic             = cpu_to_le64(jset_magic(c));
        jset->version           = c->sb.version < bcachefs_metadata_version_bkey_renumber
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);

        SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

        if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
                j->last_empty_seq = le64_to_cpu(jset->seq);

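        /*
         * An entry that will be encrypted can't be validated once it's been
         * encrypted, and entries in an older on-disk format also need
         * validating up front - in those cases validate before computing the
         * checksum, otherwise validate the final checksummed entry:
         */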
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;

        if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
                validate_before_checksum = true;

        if (validate_before_checksum &&
            jset_validate_for_write(c, jset))
                goto err;

        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
        if (bch2_fs_fatal_err_on(ret, c,
                        "error encrypting journal entry: %i", ret))
                goto err;

        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);

        if (!validate_before_checksum &&
            jset_validate_for_write(c, jset))
                goto err;

        sectors = vstruct_sectors(jset, c->block_bits);
        BUG_ON(sectors > w->sectors);

        bytes = vstruct_bytes(jset);
        memset((void *) jset + bytes, 0, (sectors << 9) - bytes);

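        /*
         * Allocate space for the write: if allocation fails but there are
         * buckets awaiting discard, issue the discards and retry, since that
         * may free up journal space:
         */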
retry_alloc:
        spin_lock(&j->lock);
        ret = journal_write_alloc(j, w, sectors);

        if (ret && j->can_discard) {
                spin_unlock(&j->lock);
                bch2_journal_do_discards(j);
                goto retry_alloc;
        }

        if (ret)
                __bch2_journal_debug_to_text(&journal_debug_buf, j);

        /*
         * write is allocated, no longer need to account for it in
         * bch2_journal_space_available():
         */
        w->sectors = 0;

        /*
         * journal entry has been compacted and allocated, recalculate space
         * available:
         */
        bch2_journal_space_available(j);
        spin_unlock(&j->lock);

        if (ret) {
                bch_err(c, "Unable to allocate journal write:\n%s",
                        journal_debug_buf.buf);
                printbuf_exit(&journal_debug_buf);
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, c->io_complete_wq);
                return;
        }

        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

        if (c->opts.nochanges)
                goto no_io;

        for_each_rw_member(ca, c, i)
                nr_rw_members++;

        if (nr_rw_members > 1)
                w->separate_flush = true;

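        /*
         * With multiple devices, flush writes issue standalone flushes to
         * every rw member up front instead of REQ_PREFLUSH on the data bios,
         * so that devices not holding a journal replica are flushed too:
         */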
        if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
                for_each_rw_member(ca, c, i) {
                        percpu_ref_get(&ca->io_ref);

                        bio = ca->journal.bio;
                        bio_reset(bio);
                        bio_set_dev(bio, ca->disk_sb.bdev);
                        bio->bi_opf             = REQ_OP_FLUSH;
                        bio->bi_end_io          = journal_write_endio;
                        bio->bi_private         = ca;
                        closure_bio_submit(bio, cl);
                }
        }

        continue_at(cl, do_journal_write, c->io_complete_wq);
        return;
no_io:
        continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
err:
        bch2_fatal_error(c);
        continue_at(cl, journal_write_done, c->io_complete_wq);
}