// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"

#include <trace/events/bcachefs.h>

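/*
 * Map a journal sequence number to an index in the journal_entries genradix:
 * indices are relative to c->journal_entries_base_seq (set from the first
 * entry we read), and masking with (~0U >> 1) keeps them within the positive
 * 31 bit range a genradix can be indexed by.
 */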
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
        return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
                                  struct journal_replay *i)
{
        struct journal_replay **p =
                genradix_ptr(&c->journal_entries,
                             journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

        BUG_ON(*p != i);
        *p = NULL;
        kvpfree(i, offsetof(struct journal_replay, j) +
                vstruct_bytes(&i->j));
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
        i->ignore = true;

        if (!c->opts.read_entire_journal)
                __journal_replay_free(c, i);
}

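/*
 * State shared by the per-device journal read closures: @last_seq is the
 * oldest sequence number we still need, @ret collects the first error
 * encountered, and @lock protects both as well as the journal_entries radix
 * tree they fill in.
 */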
struct journal_list {
        struct closure          cl;
        u64                     last_seq;
        struct mutex            lock;
        int                     ret;
};

#define JOURNAL_ENTRY_ADD_OK            0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE  5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                             struct journal_ptr entry_ptr,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
{
        struct genradix_iter iter;
        struct journal_replay **_i, *i, *dup;
        struct journal_ptr *ptr;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
        int ret = JOURNAL_ENTRY_ADD_OK;

        /* Is this entry older than the range we need? */
        if (!c->opts.read_entire_journal &&
            le64_to_cpu(j->seq) < jlist->last_seq)
                return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

        /*
         * genradixes are indexed by a ulong, not a u64, so we can't index them
         * by sequence number directly: Assume instead that they will all fall
         * within the range of +-2 billion of the first one we find.
         */
        if (!c->journal_entries_base_seq)
                c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

        /* Drop entries we don't need anymore */
        if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
                genradix_for_each_from(&c->journal_entries, iter, _i,
                                       journal_entry_radix_idx(c, jlist->last_seq)) {
                        i = *_i;

                        if (!i || i->ignore)
                                continue;

                        if (le64_to_cpu(i->j.seq) >= last_seq)
                                break;
                        journal_replay_free(c, i);
                }
        }

        jlist->last_seq = max(jlist->last_seq, last_seq);

        _i = genradix_ptr_alloc(&c->journal_entries,
                                journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
                                GFP_KERNEL);
        if (!_i)
                return -ENOMEM;

        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
        dup = *_i;
        if (dup) {
                if (dup->bad) {
                        /* we'll replace @dup: */
                } else if (bad) {
                        i = dup;
                        goto found;
                } else {
                        fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
                                    memcmp(j, &dup->j, bytes), c,
                                    "found duplicate but non-identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
                        i = dup;
                        goto found;
                }
        }

        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
        if (!i)
                return -ENOMEM;

        i->nr_ptrs      = 0;
        i->bad          = bad;
        i->ignore       = false;
        memcpy(&i->j, j, bytes);

        if (dup) {
                i->nr_ptrs = dup->nr_ptrs;
                memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
                __journal_replay_free(c, dup);
        }

        *_i = i;
found:
        for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
                if (ptr->dev == ca->dev_idx) {
                        bch_err(c, "duplicate journal entry %llu on same device",
                                le64_to_cpu(i->j.seq));
                        goto out;
                }
        }

        if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
                bch_err(c, "found too many copies of journal entry %llu",
                        le64_to_cpu(i->j.seq));
                goto out;
        }

        i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
        return ret;
}

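/*
 * The checksum/encryption nonce for a jset is derived from its sequence
 * number, so every journal entry is encrypted with a unique nonce:
 */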
static struct nonce journal_nonce(const struct jset *jset)
{
        return (struct nonce) {{
                [0] = 0,
                [1] = ((__le32 *) &jset->seq)[0],
                [2] = ((__le32 *) &jset->seq)[1],
                [3] = BCH_NONCE_JOURNAL,
        }};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
        struct jset_entry *entry;

        for (entry = start; entry != end; entry = vstruct_next(entry))
                memset(entry, 0, sizeof(*entry));
}

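/*
 * Status codes returned by jset_validate(), distinct from negative errnos;
 * the read path handles them in journal_read_bucket():
 */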
#define JOURNAL_ENTRY_REREAD    5
#define JOURNAL_ENTRY_NONE      6
#define JOURNAL_ENTRY_BAD       7

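/*
 * Validation helpers: on READ, errors are reported through fsck and we
 * attempt repair; on WRITE, corrupt metadata we were about to write is
 * reported as a filesystem inconsistency:
 */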
#define journal_entry_err(c, msg, ...)                                  \
({                                                                      \
        switch (write) {                                                \
        case READ:                                                      \
                mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
                break;                                                  \
        case WRITE:                                                     \
                bch_err(c, "corrupt metadata before write:\n"           \
                        msg, ##__VA_ARGS__);                            \
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
                        goto fsck_err;                                  \
                }                                                       \
                break;                                                  \
        }                                                               \
        true;                                                           \
})

#define journal_entry_err_on(cond, c, msg, ...)                         \
        ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

#define FSCK_DELETED_KEY        5

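/*
 * Validate a single bkey in a journal entry: on error the key is dropped -
 * either by truncating the entry at the bad key or by removing just that key
 * - and FSCK_DELETED_KEY is returned so the caller revalidates from the same
 * position:
 */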
static int journal_validate_key(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
                                struct bkey_i *k, const char *type,
                                unsigned version, int big_endian, int write)
{
        void *next = vstruct_next(entry);
        struct printbuf buf = PRINTBUF;
        int ret = 0;

        if (journal_entry_err_on(!k->k.u64s, c,
                        "invalid %s in %s entry offset %zi/%u: k->u64s 0",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on((void *) bkey_next(k) >
                                (void *) vstruct_next(entry), c,
                        "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s))) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
                        "invalid %s in %s entry offset %zi/%u: bad format %u",
                        type, where,
                        (u64 *) k - entry->_data,
                        le16_to_cpu(entry->u64s),
                        k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
        }

        if (!write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));

        if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                              __btree_node_type(level, btree_id), write, &buf)) {
                printbuf_reset(&buf);
                pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:",
                       type, where,
                       (u64 *) k - entry->_data,
                       le16_to_cpu(entry->u64s));
                pr_newline(&buf);
                pr_indent_push(&buf, 2);

                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
                pr_newline(&buf);
                bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                  __btree_node_type(level, btree_id), write, &buf);

                mustfix_fsck_err(c, "%s", buf.buf);

                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
                journal_entry_null_range(vstruct_next(entry), next);

                printbuf_exit(&buf);
                return FSCK_DELETED_KEY;
        }

        if (write)
                bch2_bkey_compat(level, btree_id, version, big_endian,
                                 write, NULL, bkey_to_packed(k));
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

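/*
 * Per-type validate/to_text methods, one pair for each BCH_JSET_ENTRY type;
 * they're wired up via bch2_jset_entry_ops[] below:
 */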
static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;

        while (k != vstruct_last(entry)) {
                int ret = journal_validate_key(c, where, entry,
                                               entry->level,
                                               entry->btree_id,
                                               k, "key", version, big_endian, write);
                if (ret == FSCK_DELETED_KEY)
                        continue;

                k = bkey_next(k);
        }

        return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct bkey_i *k;
        bool first = true;

        vstruct_for_each(entry, k) {
                if (!first) {
                        pr_newline(out);
                        pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                }
                pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
                bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
                first = false;
        }
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
{
        struct bkey_i *k = entry->start;
        int ret = 0;

        if (journal_entry_err_on(!entry->u64s ||
                                 le16_to_cpu(entry->u64s) != k->k.u64s, c,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
                 * we don't want to null out this jset_entry,
                 * just the contents, so that later we can tell
                 * we were _supposed_ to have a btree root
                 */
                entry->u64s = 0;
                journal_entry_null_range(vstruct_next(entry), next);
                return 0;
        }

        return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
                                    "btree root", version, big_endian, write);
fsck_err:
        return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        /* obsolete, don't care: */
        return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
fsck_err:
        return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_blacklist *bl =
                container_of(entry, struct jset_entry_blacklist, entry);

        pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
                                               const char *where,
                                               struct jset_entry *entry,
                                               unsigned version, int big_endian, int write)
{
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;

        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
        }

        bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
                                 le64_to_cpu(bl_entry->end), c,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
out:
fsck_err:
        return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
                                               struct jset_entry *entry)
{
        struct jset_entry_blacklist_v2 *bl =
                container_of(entry, struct jset_entry_blacklist_v2, entry);

        pr_buf(out, "start=%llu end=%llu",
               le64_to_cpu(bl->start),
               le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u),
                                 c,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);

        pr_buf(out, "type=%s v=%llu",
               bch2_fs_usage_types[u->entry.btree_id],
               le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
                                 c,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                             struct jset_entry *entry)
{
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);

        bch2_replicas_entry_to_text(out, &u->r);
        pr_buf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        int ret = 0;

        if (journal_entry_err_on(bytes != sizeof(*clock),
                                 c, "invalid journal entry clock: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(clock->rw > 1,
                                 c, "invalid journal entry clock: bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
                                        struct jset_entry *entry)
{
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);

        pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
        unsigned expected = sizeof(*u);
        unsigned dev;
        int ret = 0;

        if (journal_entry_err_on(bytes < expected,
                                 c, "invalid journal entry dev usage: bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        dev = le32_to_cpu(u->dev);

        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
                                 c, "invalid journal entry dev usage: bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

        if (journal_entry_err_on(u->pad,
                                 c, "invalid journal entry dev usage: bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }

fsck_err:
        return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
                                            struct jset_entry *entry)
{
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
        unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

        pr_buf(out, "dev=%u", le32_to_cpu(u->dev));

        for (i = 0; i < nr_types; i++) {
                if (i < BCH_DATA_NR)
                        pr_buf(out, " %s", bch2_data_types[i]);
                else
                        pr_buf(out, " (unknown data type %u)", i);
                pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
                       le64_to_cpu(u->d[i].buckets),
                       le64_to_cpu(u->d[i].sectors),
                       le64_to_cpu(u->d[i].fragmented));
        }

        pr_buf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}

static int journal_entry_log_validate(struct bch_fs *c,
                                      const char *where,
                                      struct jset_entry *entry,
                                      unsigned version, int big_endian, int write)
{
        return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
                                      struct jset_entry *entry)
{
        struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
        unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

        pr_buf(out, "%.*s", bytes, l->d);
}

struct jset_entry_ops {
        int (*validate)(struct bch_fs *, const char *,
                        struct jset_entry *, unsigned, int, int);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)                                                \
        [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
                .validate       = journal_entry_##f##_validate, \
                .to_text        = journal_entry_##f##_to_text,  \
        },
        BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
                                struct jset_entry *entry,
                                unsigned version, int big_endian, int write)
{
        return entry->type < BCH_JSET_ENTRY_NR
                ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
                                version, big_endian, write)
                : 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
                                struct jset_entry *entry)
{
        if (entry->type < BCH_JSET_ENTRY_NR) {
                pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
        } else {
                pr_buf(out, "(unknown type %u)", entry->type);
        }
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
{
        char buf[100];
        struct jset_entry *entry;
        int ret = 0;

        vstruct_for_each(jset, entry) {
                scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
                          le64_to_cpu(jset->seq),
                          (u64 *) entry - jset->_data,
                          le32_to_cpu(jset->u64s));

                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(jset), c,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }

                ret = bch2_journal_entry_validate(c, buf, entry,
                                        le32_to_cpu(jset->version),
                                        JSET_BIG_ENDIAN(jset), write);
                if (ret)
                        break;
        }
fsck_err:
        return ret;
}

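/*
 * Validate a jset we read: check the magic, version, size and checksum, and
 * decrypt the payload. Returns 0 if the jset is usable, or one of the
 * JOURNAL_ENTRY_* codes above; the entries themselves are validated
 * separately, by jset_validate_entries():
 */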
static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
                         unsigned bucket_sectors_left,
                         unsigned sectors_read,
                         int write)
{
        size_t bytes = vstruct_bytes(jset);
        struct bch_csum csum;
        unsigned version;
        int ret = 0, err;

        if (le64_to_cpu(jset->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;

        version = le32_to_cpu(jset->version);
        if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
                                  version < bcachefs_metadata_version_min) ||
                                 version >= bcachefs_metadata_version_max, c,
                        "%s sector %llu seq %llu: unknown journal entry version %u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        version)) {
                /* don't try to continue: */
                return EINVAL;
        }

        if (bytes > (sectors_read << 9) &&
            sectors_read < bucket_sectors_left)
                return JOURNAL_ENTRY_REREAD;

        if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq), bytes)) {
                ret = JOURNAL_ENTRY_BAD;
                le32_add_cpu(&jset->u64s,
                             -((bytes - (bucket_sectors_left << 9)) / 8));
        }

        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
                        JSET_CSUM_TYPE(jset))) {
                ret = JOURNAL_ENTRY_BAD;
                goto csum_done;
        }

        if (write)
                goto csum_done;

        csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
        if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
                                 "%s sector %llu seq %llu: journal checksum bad",
                                 ca ? ca->name : c->name,
                                 sector, le64_to_cpu(jset->seq)))
                ret = JOURNAL_ENTRY_BAD;

        /*
         * note: don't clobber @ret here - a checksum error above must still
         * be returned as JOURNAL_ENTRY_BAD:
         */
        err = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                           jset->encrypted_start,
                           vstruct_end(jset) - (void *) jset->encrypted_start);
        bch2_fs_fatal_err_on(err, c,
                        "error decrypting journal entry: %i", err);
csum_done:
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
                jset->last_seq = jset->seq;
                return JOURNAL_ENTRY_BAD;
        }
fsck_err:
        return ret;
}

static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
{
        unsigned sectors = vstruct_sectors(jset, c->block_bits);

        return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
                jset_validate_entries(c, jset, WRITE);
}

struct journal_read_buf {
        void            *data;
        size_t          size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
                                    size_t new_size)
{
        void *n;

        /* the bios are sized for this many pages, max: */
        if (new_size > JOURNAL_ENTRY_SIZE_MAX)
                return -ENOMEM;

        new_size = roundup_pow_of_two(new_size);
        n = kvpmalloc(new_size, GFP_KERNEL);
        if (!n)
                return -ENOMEM;

        kvpfree(b->data, b->size);
        b->data = n;
        b->size = new_size;
        return 0;
}

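/*
 * Read one journal bucket, adding each journal entry found to the journal
 * list. Read errors aren't fatal here: the same entries may still be found
 * on another device.
 */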
static int journal_read_bucket(struct bch_dev *ca,
                               struct journal_read_buf *buf,
                               struct journal_list *jlist,
                               unsigned bucket)
{
        struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
        struct jset *j = NULL;
        unsigned sectors, sectors_read = 0;
        u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
            end = offset + ca->mi.bucket_size;
        bool saw_bad = false;
        int ret = 0;

        pr_debug("reading %u", bucket);

        while (offset < end) {
                if (!sectors_read) {
                        struct bio *bio;
reread:
                        sectors_read = min_t(unsigned,
                                end - offset, buf->size >> 9);

                        bio = bio_kmalloc(GFP_KERNEL,
                                          buf_pages(buf->data,
                                                    sectors_read << 9));
                        bio_set_dev(bio, ca->disk_sb.bdev);
                        bio->bi_iter.bi_sector  = offset;
                        bio_set_op_attrs(bio, REQ_OP_READ, 0);
                        bch2_bio_map(bio, buf->data, sectors_read << 9);

                        ret = submit_bio_wait(bio);
                        bio_put(bio);

                        if (bch2_dev_io_err_on(ret, ca,
                                               "journal read error: sector %llu",
                                               offset) ||
                            bch2_meta_read_fault("journal")) {
                                /*
                                 * We don't error out of the recovery process
                                 * here, since the relevant journal entry may be
                                 * found on a different device, and missing or
                                 * no journal entries will be handled later
                                 */
                                return 0;
                        }

                        j = buf->data;
                }

                ret = jset_validate(c, ca, j, offset,
                                    end - offset, sectors_read,
                                    READ);
                switch (ret) {
                case BCH_FSCK_OK:
                        sectors = vstruct_sectors(j, c->block_bits);
                        break;
                case JOURNAL_ENTRY_REREAD:
                        if (vstruct_bytes(j) > buf->size) {
                                ret = journal_read_buf_realloc(buf,
                                                        vstruct_bytes(j));
                                if (ret)
                                        return ret;
                        }
                        goto reread;
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
                        sectors = block_sectors(c);
                        goto next_block;
                case JOURNAL_ENTRY_BAD:
                        saw_bad = true;
                        /*
                         * On checksum error we don't really trust the size
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
                        sectors = block_sectors(c);
                        break;
                default:
                        return ret;
                }

                /*
                 * This happens sometimes if we don't have discards on -
                 * when we've partially overwritten a bucket with new
                 * journal entries. We don't need the rest of the
                 * bucket:
                 */
                if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
                        return 0;

                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

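                /*
                 * Add the entry even if validation failed (ret != 0, passed
                 * as @bad below) - journal_entry_add() will prefer a copy of
                 * the same entry that read cleanly from another device:
                 */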
                mutex_lock(&jlist->lock);
                ret = journal_entry_add(c, ca, (struct journal_ptr) {
                                        .dev            = ca->dev_idx,
                                        .bucket         = bucket,
                                        .bucket_offset  = offset -
                                                bucket_to_sector(ca, ja->buckets[bucket]),
                                        .sector         = offset,
                                        }, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);

                switch (ret) {
                case JOURNAL_ENTRY_ADD_OK:
                        break;
                case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
                        break;
                default:
                        return ret;
                }
next_block:
                pr_debug("next");
                offset          += sectors;
                sectors_read    -= sectors;
                j = ((void *) j) + (sectors << 9);
        }

        return 0;
}

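/*
 * Closure callback, one per device: read every journal bucket on this
 * device, then work out where its journal write pointer should resume.
 */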
static void bch2_journal_read_device(struct closure *cl)
{
        struct journal_device *ja =
                container_of(cl, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
        struct journal_replay *r, **_r;
        struct genradix_iter iter;
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
        int ret = 0;

        if (!ja->nr)
                goto out;

        ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
        if (ret)
                goto err;

        pr_debug("%u journal buckets", ja->nr);

        for (i = 0; i < ja->nr; i++) {
                ret = journal_read_bucket(ca, &buf, jlist, i);
                if (ret)
                        goto err;
        }

        /* Find the journal bucket with the highest sequence number: */
        for (i = 0; i < ja->nr; i++) {
                if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
                        ja->cur_idx = i;

                min_seq = min(ja->bucket_seq[i], min_seq);
        }

        /*
         * If there are duplicate journal entries in multiple buckets (which
         * definitely isn't supposed to happen, but...) - make sure to start
         * cur_idx at the last of those buckets, so we don't deadlock trying to
         * allocate:
         */
        while (ja->bucket_seq[ja->cur_idx] > min_seq &&
               ja->bucket_seq[ja->cur_idx] ==
               ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
                ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

        ja->sectors_free = ca->mi.bucket_size;

        mutex_lock(&jlist->lock);
        genradix_for_each(&c->journal_entries, iter, _r) {
                r = *_r;

                if (!r)
                        continue;

                for (i = 0; i < r->nr_ptrs; i++) {
                        if (r->ptrs[i].dev == ca->dev_idx &&
                            sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
                                unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
                                        vstruct_sectors(&r->j, c->block_bits);

                                ja->sectors_free = min(ja->sectors_free,
                                                       ca->mi.bucket_size - wrote);
                        }
                }
        }
        mutex_unlock(&jlist->lock);

        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
                        unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
                ja->sectors_free = 0;
        }

        /*
         * Set dirty_idx to indicate the entire journal is full and needs to be
         * reclaimed - journal reclaim will immediately reclaim whatever isn't
         * pinned when it first runs:
         */
        ja->discard_idx = ja->dirty_idx_ondisk =
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
        bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
        return;
err:
        mutex_lock(&jlist->lock);
        jlist->ret = ret;
        mutex_unlock(&jlist->lock);
        goto out;
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               struct journal_replay *j)
{
        unsigned i;

        for (i = 0; i < j->nr_ptrs; i++) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
                u64 offset;

                div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

                if (i)
                        pr_buf(out, " ");
                pr_buf(out, "%u:%u:%u (sector %llu)",
                       j->ptrs[i].dev,
                       j->ptrs[i].bucket,
                       j->ptrs[i].bucket_offset,
                       j->ptrs[i].sector);
        }
}

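/*
 * Read the journal from all devices. On success, *start_seq is one past the
 * newest sequence number found and *blacklist_seq is one past the newest
 * flush entry: sequence numbers in [*blacklist_seq, *start_seq) were never
 * flushed and will be blacklisted.
 */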
int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
        struct journal_list jlist;
        struct journal_replay *i, **_i, *prev = NULL;
        struct genradix_iter radix_iter;
        struct bch_dev *ca;
        unsigned iter;
        struct printbuf buf = PRINTBUF;
        size_t keys = 0, entries = 0;
        bool degraded = false;
        u64 seq, last_seq = 0;
        int ret = 0;

        closure_init_stack(&jlist.cl);
        mutex_init(&jlist.lock);
        jlist.last_seq = 0;
        jlist.ret = 0;

        for_each_member_device(ca, c, iter) {
                if (!c->opts.fsck &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;

                if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
                     ca->mi.state == BCH_MEMBER_STATE_ro) &&
                    percpu_ref_tryget(&ca->io_ref))
                        closure_call(&ca->journal.read,
                                     bch2_journal_read_device,
                                     system_unbound_wq,
                                     &jlist.cl);
                else
                        degraded = true;
        }

        closure_sync(&jlist.cl);

        if (jlist.ret)
                return jlist.ret;

        *start_seq = 0;

        /*
         * Find the most recent flush entry, and ignore newer non-flush
         * entries - those entries will be blacklisted:
         */
        genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                if (!*start_seq)
                        *start_seq = le64_to_cpu(i->j.seq) + 1;

                if (!JSET_NO_FLUSH(&i->j)) {
                        last_seq        = le64_to_cpu(i->j.last_seq);
                        *blacklist_seq  = le64_to_cpu(i->j.seq) + 1;
                        break;
                }

                journal_replay_free(c, i);
        }

        if (!*start_seq) {
                bch_info(c, "journal read done, but no entries found");
                return 0;
        }

        if (!last_seq) {
                fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
                ret = -1;
                goto err;
        }

        /* Drop blacklisted entries and entries older than last_seq: */
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                seq = le64_to_cpu(i->j.seq);
                if (seq < last_seq) {
                        journal_replay_free(c, i);
                        continue;
                }

                if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
                        fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
                                    "found blacklisted journal entry %llu", seq);

                        journal_replay_free(c, i);
                }
        }

        /* Check for missing entries: */
        seq = last_seq;
        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                i = *_i;

                if (!i || i->ignore)
                        continue;

                BUG_ON(seq > le64_to_cpu(i->j.seq));

                while (seq < le64_to_cpu(i->j.seq)) {
                        u64 missing_start, missing_end;
                        struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (seq == le64_to_cpu(i->j.seq))
                                break;

                        missing_start = seq;

                        while (seq < le64_to_cpu(i->j.seq) &&
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;

                        if (prev) {
                                bch2_journal_ptrs_to_text(&buf1, c, prev);
                                pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
                        } else
                                pr_buf(&buf1, "(none)");
                        bch2_journal_ptrs_to_text(&buf2, c, i);

                        missing_end = seq - 1;
                        fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
                                 "  prev at %s\n"
                                 "  next at %s",
                                 missing_start, missing_end,
                                 last_seq, *blacklist_seq - 1,
                                 buf1.buf, buf2.buf);

                        printbuf_exit(&buf1);
                        printbuf_exit(&buf2);
                }

                prev = i;
                seq++;
        }

        genradix_for_each(&c->journal_entries, radix_iter, _i) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
                struct bch_replicas_padded replicas = {
                        .e.data_type = BCH_DATA_journal,
                        .e.nr_required = 1,
                };
                unsigned ptr;

                i = *_i;
                if (!i || i->ignore)
                        continue;

                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
                        goto err;

                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

                bch2_replicas_entry_sort(&replicas.e);

                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
                 */

                printbuf_reset(&buf);
                bch2_replicas_entry_to_text(&buf, &replicas.e);

                if (!degraded &&
                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
                                "superblock not marked as containing replicas %s",
                                buf.buf)) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
                }

                for_each_jset_key(k, _n, entry, &i->j)
                        keys++;
                entries++;
        }

        bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
                 keys, entries, *start_seq);

        if (*start_seq != *blacklist_seq)
                bch_info(c, "dropped unflushed entries %llu-%llu",
                         *blacklist_seq, *start_seq - 1);
err:
fsck_err:
        printbuf_exit(&buf);
        return ret;
}

/* journal write: */

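/*
 * Add pointers for a journal write to the devices in @devs_sorted, in order,
 * skipping devices that are unusable or already have a copy, until the write
 * has @replicas_want replicas:
 */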
static void __journal_write_alloc(struct journal *j,
                                  struct journal_buf *w,
                                  struct dev_alloc_list *devs_sorted,
                                  unsigned sectors,
                                  unsigned *replicas,
                                  unsigned replicas_want)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_device *ja;
        struct bch_dev *ca;
        unsigned i;

        if (*replicas >= replicas_want)
                return;

        for (i = 0; i < devs_sorted->nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                /*
                 * Check that we can use this device, and aren't already using
                 * it:
                 */
                if (!ca->mi.durability ||
                    ca->mi.state != BCH_MEMBER_STATE_rw ||
                    !ja->nr ||
                    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
                                         ca->dev_idx) ||
                    sectors > ja->sectors_free)
                        continue;

                bch2_dev_stripe_increment(ca, &j->wp.stripe);

                bch2_bkey_append_ptr(&w->key,
                        (struct bch_extent_ptr) {
                                  .offset = bucket_to_sector(ca,
                                        ja->buckets[ja->cur_idx]) +
                                        ca->mi.bucket_size -
                                        ja->sectors_free,
                                  .dev = ca->dev_idx,
                });

                ja->sectors_free -= sectors;
                ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

                *replicas += ca->mi.durability;

                if (*replicas >= replicas_want)
                        break;
        }
}

/**
 * journal_write_alloc - allocate space for a journal write, moving on to the
 * next journal bucket if possible
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
        unsigned target = c->opts.metadata_target ?:
                c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);

        rcu_read_lock();
retry:
        devs = target_rw_devs(c, BCH_DATA_journal, target);

        devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas >= replicas_want)
                goto done;

        for (i = 0; i < devs_sorted.nr; i++) {
                ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
                if (!ca)
                        continue;

                ja = &ca->journal;

                if (sectors > ja->sectors_free &&
                    sectors <= ca->mi.bucket_size &&
                    bch2_journal_dev_buckets_available(j, ja,
                                        journal_space_discarded)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;

                        /*
                         * ja->bucket_seq[ja->cur_idx] must always have
                         * something sensible:
                         */
                        ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
                }
        }

        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);

        if (replicas < replicas_want && target) {
                /* Retry from all devices: */
                target = 0;
                goto retry;
        }
done:
        rcu_read_unlock();

        BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

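/*
 * Try to grow the journal buffer to buf_size_want; allocation failure here
 * is fine, we just keep using the current (smaller) buffer:
 */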
1363 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1364 {
1365         /* we aren't holding j->lock: */
1366         unsigned new_size = READ_ONCE(j->buf_size_want);
1367         void *new_buf;
1368
1369         if (buf->buf_size >= new_size)
1370                 return;
1371
1372         new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
1373         if (!new_buf)
1374                 return;
1375
1376         memcpy(new_buf, buf->data, buf->buf_size);
1377
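        /*
         * Swap the new buffer in under j->lock so that buf->data and
         * buf->buf_size are updated atomically with respect to readers;
         * the old buffer is freed after dropping the lock:
         */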
1378         spin_lock(&j->lock);
1379         swap(buf->data,         new_buf);
1380         swap(buf->buf_size,     new_size);
1381         spin_unlock(&j->lock);
1382
1383         kvpfree(new_buf, new_size);
1384 }
1385
1386 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1387 {
1388         return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1389 }
1390
1391 static void journal_write_done(struct closure *cl)
1392 {
1393         struct journal *j = container_of(cl, struct journal, io);
1394         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1395         struct journal_buf *w = journal_last_unwritten_buf(j);
1396         struct bch_replicas_padded replicas;
1397         union journal_res_state old, new;
1398         u64 v, seq;
1399         int err = 0;
1400
1401         bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
1402                                ? j->flush_write_time
1403                                : j->noflush_write_time, j->write_start_time);
1404
1405         if (!w->devs_written.nr) {
1406                 bch_err(c, "unable to write journal to sufficient devices");
1407                 err = -EIO;
1408         } else {
1409                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1410                                          w->devs_written);
1411                 if (bch2_mark_replicas(c, &replicas.e))
1412                         err = -EIO;
1413         }
1414
1415         if (err)
1416                 bch2_fatal_error(c);
1417
1418         spin_lock(&j->lock);
1419         seq = le64_to_cpu(w->data->seq);
1420
1421         if (seq >= j->pin.front)
1422                 journal_seq_pin(j, seq)->devs = w->devs_written;
1423
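        /*
         * If this was a flush write, updates journalled up to this entry
         * are now durable - kick off discards and wake allocator waiters.
         * On error, record the first sequence number that failed to write:
         */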
1424         if (!err) {
1425                 if (!JSET_NO_FLUSH(w->data)) {
1426                         j->flushed_seq_ondisk = seq;
1427                         j->last_seq_ondisk = w->last_seq;
1428
1429                         bch2_do_discards(c);
1430                         closure_wake_up(&c->freelist_wait);
1431                 }
1432         } else if (!j->err_seq || seq < j->err_seq)
1433                 j->err_seq      = seq;
1434
1435         j->seq_ondisk           = seq;
1436
1437         /*
1438          * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1439          * more buckets:
1440          *
1441          * Must come before signaling write completion, for
1442          * bch2_fs_journal_stop():
1443          */
1444         if (j->watermark)
1445                 journal_reclaim_kick(&c->journal);
1446
1447         /* also must come before signalling write completion: */
1448         closure_debug_destroy(cl);
1449
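        /*
         * Mark this journal buffer as written by advancing unwritten_idx;
         * done with a cmpxchg loop since the reservations state is updated
         * concurrently by writers taking journal reservations:
         */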
1450         v = atomic64_read(&j->reservations.counter);
1451         do {
1452                 old.v = new.v = v;
1453                 BUG_ON(journal_state_count(new, new.unwritten_idx));
1454
1455                 new.unwritten_idx++;
1456         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
1457                                        old.v, new.v)) != old.v);
1458
1459         bch2_journal_space_available(j);
1460
1461         closure_wake_up(&w->wait);
1462         journal_wake(j);
1463
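        /*
         * If the next journal entry is already closed and waiting, start
         * writing it immediately; otherwise, if the current entry is still
         * open, (re)arm the delayed write based on when it expires:
         */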
1464         if (!journal_state_count(new, new.unwritten_idx) &&
1465             journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
1466                 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
1467         } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1468                    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1469                 struct journal_buf *buf = journal_cur_buf(j);
1470                 long delta = buf->expires - jiffies;
1471
1472                 /*
1473                  * We don't close a journal entry to write it while there are
1474                  * previous entries still in flight - the current journal entry
1475                  * might want to be written now:
1476                  */
1477
1478                 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
1479         }
1480
1481         spin_unlock(&j->lock);
1482 }
1483
1484 static void journal_write_endio(struct bio *bio)
1485 {
1486         struct bch_dev *ca = bio->bi_private;
1487         struct journal *j = &ca->fs->journal;
1488         struct journal_buf *w = journal_last_unwritten_buf(j);
1489         unsigned long flags;
1490
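        /*
         * On an IO error, drop this device from the set of devices the
         * entry was written to, so the replicas accounting done in
         * journal_write_done() reflects where the entry actually landed:
         */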
1491         if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
1492                                le64_to_cpu(w->data->seq),
1493                                bch2_blk_status_to_str(bio->bi_status)) ||
1494             bch2_meta_write_fault("journal")) {
1495                 spin_lock_irqsave(&j->err_lock, flags);
1496                 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1497                 spin_unlock_irqrestore(&j->err_lock, flags);
1498         }
1499
1500         closure_put(&j->io);
1501         percpu_ref_put(&ca->io_ref);
1502 }
1503
1504 static void do_journal_write(struct closure *cl)
1505 {
1506         struct journal *j = container_of(cl, struct journal, io);
1507         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1508         struct bch_dev *ca;
1509         struct journal_buf *w = journal_last_unwritten_buf(j);
1510         struct bch_extent_ptr *ptr;
1511         struct bio *bio;
1512         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1513
1514         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1515                 ca = bch_dev_bkey_exists(c, ptr->dev);
1516                 if (!percpu_ref_tryget(&ca->io_ref)) {
1517                         /* XXX: fix this */
1518                         bch_err(c, "missing device for journal write");
1519                         continue;
1520                 }
1521
1522                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1523                              sectors);
1524
1525                 bio = ca->journal.bio;
1526                 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1527                 bio->bi_iter.bi_sector  = ptr->offset;
1528                 bio->bi_end_io          = journal_write_endio;
1529                 bio->bi_private         = ca;
1530
1531                 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1532                 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1533
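                /*
                 * Flush writes are issued with REQ_FUA so the entry is
                 * durable on completion, and REQ_PREFLUSH to flush prior
                 * writes - unless separate flushes were already sent:
                 */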
1534                 if (!JSET_NO_FLUSH(w->data))
1535                         bio->bi_opf    |= REQ_FUA;
1536                 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1537                         bio->bi_opf    |= REQ_PREFLUSH;
1538
1539                 bch2_bio_map(bio, w->data, sectors << 9);
1540
1541                 trace_journal_write(bio);
1542                 closure_bio_submit(bio, cl);
1543
1544                 ca->journal.bucket_seq[ca->journal.cur_idx] =
1545                         le64_to_cpu(w->data->seq);
1546         }
1547
1548         continue_at(cl, journal_write_done, c->io_complete_wq);
1550 }
1551
1552 void bch2_journal_write(struct closure *cl)
1553 {
1554         struct journal *j = container_of(cl, struct journal, io);
1555         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1556         struct bch_dev *ca;
1557         struct journal_buf *w = journal_last_unwritten_buf(j);
1558         struct jset_entry *start, *end;
1559         struct jset *jset;
1560         struct bio *bio;
1561         struct printbuf journal_debug_buf = PRINTBUF;
1562         bool validate_before_checksum = false;
1563         unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
1564         int ret;
1565
1566         BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
1567
1568         journal_buf_realloc(j, w);
1569         jset = w->data;
1570
1571         j->write_start_time = local_clock();
1572
1573         spin_lock(&j->lock);
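        /*
         * If we may skip flushes, this entry doesn't have to be a flush,
         * and the previous flush write was recent enough, write it as a
         * noflush entry: noflush entries don't advance last_seq on disk,
         * so journal buckets can't be reclaimed past them until the next
         * flush write:
         */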
1574         if (bch2_journal_error(j) ||
1575             w->noflush ||
1576             (!w->must_flush &&
1577              (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
1578              test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
1579                 w->noflush = true;
1580                 SET_JSET_NO_FLUSH(jset, true);
1581                 jset->last_seq  = 0;
1582                 w->last_seq     = 0;
1583
1584                 j->nr_noflush_writes++;
1585         } else {
1586                 j->last_flush_write = jiffies;
1587                 j->nr_flush_writes++;
1588         }
1589         spin_unlock(&j->lock);
1590
1591         /*
1592          * New btree roots are set by journalling them; when the journal entry
1593          * gets written we have to propagate them to c->btree_roots
1594          *
1595          * But, every journal entry we write has to contain all the btree roots
1596          * (at least for now); so after we copy btree roots to c->btree_roots we
1597          * have to get any missing btree roots and add them to this journal
1598          * entry:
1599          */
1600
1601         bch2_journal_entries_to_btree_roots(c, jset);
1602
1603         start = end = vstruct_last(jset);
1604
1605         end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
1606
1607         bch2_journal_super_entries_add_common(c, &end,
1608                                 le64_to_cpu(jset->seq));
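        /*
         * The entries appended here must fit within the space reserved
         * via j->entry_u64s_reserved; account for them in the jset header:
         */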
1609         u64s    = (u64 *) end - (u64 *) start;
1610         BUG_ON(u64s > j->entry_u64s_reserved);
1611
1612         le32_add_cpu(&jset->u64s, u64s);
1613         BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
1614
1615         jset->magic             = cpu_to_le64(jset_magic(c));
1616         jset->version           = c->sb.version < bcachefs_metadata_version_bkey_renumber
1617                 ? cpu_to_le32(BCH_JSET_VERSION_OLD)
1618                 : cpu_to_le32(c->sb.version);
1619
1620         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1621         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1622
1623         if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
1624                 j->last_empty_seq = le64_to_cpu(jset->seq);
1625
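        /*
         * If the entry will be encrypted, it has to be validated while
         * still plaintext - likewise when writing an older on-disk
         * version; otherwise we validate after checksumming:
         */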
1626         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1627                 validate_before_checksum = true;
1628
1629         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1630                 validate_before_checksum = true;
1631
1632         if (validate_before_checksum &&
1633             jset_validate_for_write(c, jset))
1634                 goto err;
1635
1636         ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1637                     jset->encrypted_start,
1638                     vstruct_end(jset) - (void *) jset->encrypted_start);
1639         if (bch2_fs_fatal_err_on(ret, c,
1640                         "error encrypting journal entry: %i", ret))
1641                 goto err;
1642
1643         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1644                                   journal_nonce(jset), jset);
1645
1646         if (!validate_before_checksum &&
1647             jset_validate_for_write(c, jset))
1648                 goto err;
1649
1650         sectors = vstruct_sectors(jset, c->block_bits);
1651         BUG_ON(sectors > w->sectors);
1652
1653         bytes = vstruct_bytes(jset);
1654         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1655
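        /*
         * Allocate space on disk for the write; if that fails and there
         * are buckets awaiting discard, discard them and retry:
         */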
1656 retry_alloc:
1657         spin_lock(&j->lock);
1658         ret = journal_write_alloc(j, w, sectors);
1659
1660         if (ret && j->can_discard) {
1661                 spin_unlock(&j->lock);
1662                 bch2_journal_do_discards(j);
1663                 goto retry_alloc;
1664         }
1665
1666         if (ret)
1667                 __bch2_journal_debug_to_text(&journal_debug_buf, j);
1668
1669         /*
1670          * write is allocated, no longer need to account for it in
1671          * bch2_journal_space_available():
1672          */
1673         w->sectors = 0;
1674
1675         /*
1676          * journal entry has been compacted and allocated, recalculate space
1677          * available:
1678          */
1679         bch2_journal_space_available(j);
1680         spin_unlock(&j->lock);
1681
1682         if (ret) {
1683                 bch_err(c, "Unable to allocate journal write:\n%s",
1684                         journal_debug_buf.buf);
1685                 printbuf_exit(&journal_debug_buf);
1686                 bch2_fatal_error(c);
1687                 continue_at(cl, journal_write_done, c->io_complete_wq);
1688                 return;
1689         }
1690
1691         w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
1692
1693         if (c->opts.nochanges)
1694                 goto no_io;
1695
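        /*
         * With multiple rw members, issue an explicit flush to every
         * device up front instead of REQ_PREFLUSH on the journal write
         * itself, which would only flush the devices being written to:
         */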
1696         for_each_rw_member(ca, c, i)
1697                 nr_rw_members++;
1698
1699         if (nr_rw_members > 1)
1700                 w->separate_flush = true;
1701
1702         if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
1703                 for_each_rw_member(ca, c, i) {
1704                         percpu_ref_get(&ca->io_ref);
1705
1706                         bio = ca->journal.bio;
1707                         bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
1708                         bio->bi_end_io          = journal_write_endio;
1709                         bio->bi_private         = ca;
1710                         closure_bio_submit(bio, cl);
1711                 }
1712         }
1713
1714         continue_at(cl, do_journal_write, c->io_complete_wq);
1715         return;
1716 no_io:
1717         continue_at(cl, journal_write_done, c->io_complete_wq);
1718         return;
1719 err:
1720         bch2_fatal_error(c);
1721         continue_at(cl, journal_write_done, c->io_complete_wq);
1722 }