// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"

#include <trace/events/bcachefs.h>

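/*
 * Journal sequence numbers are 64 bit, but a genradix is indexed by a
 * ulong: map each seq into a 31-bit window relative to
 * c->journal_entries_base_seq.  E.g. with a base of 1000, seq 1005
 * lands at radix index 5; the (~0U >> 1) mask keeps indices within the
 * window.
 */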
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvpfree(i, offsetof(struct journal_replay, j) +
		vstruct_bytes(&i->j));
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
	i->ignore = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

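/*
 * A single journal_list is shared by all the per-device read closures
 * spawned in bch2_journal_read(): @lock serializes journal_entry_add()
 * calls, and @ret reports read errors back to the parent closure.
 */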
struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j,
			     bool bad)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	struct journal_ptr *ptr;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	int ret = JOURNAL_ENTRY_ADD_OK;

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: assume instead that they will all fall
	 * within the range of +-2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (!i || i->ignore)
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;
			journal_replay_free(c, i);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -ENOMEM;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
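	/*
	 * Three cases: the old copy is bad (replace it below), the new
	 * copy is bad (keep the old one, just record the new ptr), or
	 * both read cleanly, in which case they must be byte-identical:
	 */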
	dup = *_i;
	if (dup) {
		if (dup->bad) {
			/* we'll replace @dup: */
		} else if (bad) {
			i = dup;
			goto found;
		} else {
			fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
				    memcmp(j, &dup->j, bytes), c,
				    "found duplicate but non identical journal entries (seq %llu)",
				    le64_to_cpu(j->seq));
			i = dup;
			goto found;
		}
	}

	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -ENOMEM;

	i->nr_ptrs	= 0;
	i->bad		= bad;
	i->ignore	= false;
	memcpy(&i->j, j, bytes);

	if (dup) {
		i->nr_ptrs = dup->nr_ptrs;
		memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
		__journal_replay_free(c, dup);
	}

	*_i = i;
found:
	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
		if (ptr->dev == ca->dev_idx) {
			bch_err(c, "duplicate journal entry %llu on same device",
				le64_to_cpu(i->j.seq));
			goto out;
		}
	}

	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
		bch_err(c, "found too many copies of journal entry %llu",
			le64_to_cpu(i->j.seq));
		goto out;
	}

	i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
	return ret;
}

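/*
 * The nonce for a jset is built from its 64 bit sequence number (words
 * 1 and 2) plus BCH_NONCE_JOURNAL as a type tag, making it unique
 * within the filesystem:
 */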
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

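/*
 * Positive status codes private to the journal read path; real errors
 * are negative errnos:
 */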
#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

#define journal_entry_err(c, msg, ...)					\
({									\
	switch (write) {						\
	case READ:							\
		mustfix_fsck_err(c, msg, ##__VA_ARGS__);		\
		break;							\
	case WRITE:							\
		bch_err(c, "corrupt metadata before write:\n"		\
			msg, ##__VA_ARGS__);				\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
	true;								\
})
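
/*
 * At READ time a bad entry is a repairable fsck error; at WRITE time it
 * means we generated corrupt metadata in memory, which aborts the write
 * unless the filesystem is configured to continue on errors.
 */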

#define journal_entry_err_on(cond, c, msg, ...)				\
	((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5

static int journal_validate_key(struct bch_fs *c, const char *where,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian, int write)
{
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s, c,
			"invalid key in %s at %s offset %zi/%u: k->u64s 0",
			bch2_jset_entry_types[entry->type], where,
			(u64 *) k - entry->_data,
			le16_to_cpu(entry->u64s))) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				(void *) vstruct_next(entry), c,
			"invalid key in %s at %s offset %zi/%u: extends past end of journal entry",
			bch2_jset_entry_types[entry->type], where,
			(u64 *) k - entry->_data,
			le16_to_cpu(entry->u64s))) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
			"invalid key in %s at %s offset %zi/%u: bad format %u",
			bch2_jset_entry_types[entry->type], where,
			(u64 *) k - entry->_data,
			le16_to_cpu(entry->u64s),
			k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:",
		       bch2_jset_entry_types[entry->type], where,
		       (u64 *) k - entry->_data,
		       le16_to_cpu(entry->u64s));
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, "%s", buf.buf);

		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     const char *where,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;

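	/*
	 * journal_validate_key() drops a bad key by memmove()ing the keys
	 * after it down over it, so on FSCK_DELETED_KEY @k already points
	 * at the next key and must not be advanced:
	 */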
	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, where, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian, write);
		if (ret == FSCK_DELETED_KEY)
			continue;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct bkey_i *k;
	bool first = true;

	vstruct_for_each(entry, k) {
		if (!first) {
			prt_newline(out);
			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
					     const char *where,
					     struct jset_entry *entry,
					     unsigned version, int big_endian, int write)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s, c,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
				    version, big_endian, write);
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    const char *where,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
					    const char *where,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       const char *where,
					       struct jset_entry *entry,
					       unsigned version, int big_endian, int write)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end), c,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
	       le64_to_cpu(bl->start),
	       le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
					const char *where,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_printf(out, "type=%s v=%llu",
	       bch2_fs_usage_types[u->entry.btree_id],
	       le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
					const char *where,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
					const char *where,
					struct jset_entry *entry,
					unsigned version, int big_endian, int write)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, "invalid journal entry clock: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, "invalid journal entry clock: bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    const char *where,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, "invalid journal entry dev usage: bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, "invalid journal entry dev usage: bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, "invalid journal entry dev usage: bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	for (i = 0; i < nr_types; i++) {
		if (i < BCH_DATA_NR)
			prt_printf(out, " %s", bch2_data_types[i]);
		else
			prt_printf(out, " (unknown data type %u)", i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
		       le64_to_cpu(u->d[i].buckets),
		       le64_to_cpu(u->d[i].sectors),
		       le64_to_cpu(u->d[i].fragmented));
	}

	prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}

static int journal_entry_log_validate(struct bch_fs *c,
				      const char *where,
				      struct jset_entry *entry,
				      unsigned version, int big_endian, int write)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
	unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

	prt_printf(out, "%.*s", (int) bytes, l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where,
					    struct jset_entry *entry,
					    unsigned version, int big_endian, int write)
{
	return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, const char *,
			struct jset_entry *, unsigned, int, int);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};
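
/*
 * BCH_JSET_ENTRY_TYPES() above is an x-macro list of (name, nr) pairs
 * (see bcachefs_format.h); e.g. x(btree_keys, 0) expands to an ops
 * entry wiring up journal_entry_btree_keys_validate() and _to_text().
 */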

int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
				struct jset_entry *entry,
				unsigned version, int big_endian, int write)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
				version, big_endian, write)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	} else {
		prt_printf(out, "(unknown type %u)", entry->type);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 int write)
{
	char buf[100];
	struct jset_entry *entry;
	int ret = 0;

	vstruct_for_each(jset, entry) {
		scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
			  le64_to_cpu(jset->seq),
			  (u64 *) entry - jset->_data,
			  le32_to_cpu(jset->u64s));

		if (journal_entry_err_on(vstruct_next(entry) >
					 vstruct_last(jset), c,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, buf, entry,
					le32_to_cpu(jset->version),
					JSET_BIG_ENDIAN(jset), write);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read,
			 int write)
{
	size_t bytes = vstruct_bytes(jset);
	struct bch_csum csum;
	unsigned version;
	int ret = 0, decrypt_ret;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
				  version < bcachefs_metadata_version_min) ||
				 version >= bcachefs_metadata_version_max, c,
			"%s sector %llu seq %llu: unknown journal entry version %u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			version)) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
			"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq), bytes)) {
		ret = JOURNAL_ENTRY_BAD;
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset))) {
		ret = JOURNAL_ENTRY_BAD;
		goto csum_done;
	}

	if (write)
		goto csum_done;

	csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
	if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
				 "%s sector %llu seq %llu: journal checksum bad",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq)))
		ret = JOURNAL_ENTRY_BAD;

	/* use a separate variable so a JOURNAL_ENTRY_BAD isn't clobbered: */
	decrypt_ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
		     jset->encrypted_start,
		     vstruct_end(jset) - (void *) jset->encrypted_start);
	bch2_fs_fatal_err_on(decrypt_ret, c,
			"error decrypting journal entry: %i", decrypt_ret);
csum_done:
	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}
fsck_err:
	return ret;
}

static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
{
	unsigned sectors = vstruct_sectors(jset, c->block_bits);

	return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?:
		jset_validate_entries(c, jset, WRITE);
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

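/*
 * Reallocations round the size up to a power of two, capped at
 * JOURNAL_ENTRY_SIZE_MAX: e.g. a JOURNAL_ENTRY_REREAD of a 70k jset
 * grows a 64k buffer to 128k before the read is retried.
 */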
static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -ENOMEM;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -ENOMEM;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}

static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
				end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate(c, ca, j, offset,
				    end - offset, sectors_read,
				    READ);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			sectors = block_sectors(c);
			goto next_block;
		case JOURNAL_ENTRY_BAD:
			saw_bad = true;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			break;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j, ret != 0);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

static void bch2_journal_read_device(struct closure *cl)
{
	struct journal_device *ja =
		container_of(cl, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	u64 min_seq = U64_MAX;
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/* Find the journal bucket with the highest sequence number: */
	for (i = 0; i < ja->nr; i++) {
		if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
			ja->cur_idx = i;

		min_seq = min(ja->bucket_seq[i], min_seq);
	}

	/*
	 * If there are duplicate journal entries in multiple buckets (which
	 * definitely isn't supposed to happen, but...) - make sure to start
	 * cur_idx at the last of those buckets, so we don't deadlock trying to
	 * allocate
	 */
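	/*
	 * E.g. with bucket_seq = { 10, 10, 7 }: cur_idx starts at 0, the
	 * first bucket with the max sequence number, and advances to 1,
	 * the last bucket containing that duplicated sequence number.
	 */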
	while (ja->bucket_seq[ja->cur_idx] > min_seq &&
	       ja->bucket_seq[ja->cur_idx] ==
	       ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
		ja->cur_idx = (ja->cur_idx + 1) % ja->nr;

	ja->sectors_free = ca->mi.bucket_size;

	mutex_lock(&jlist->lock);
	genradix_for_each(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		for (i = 0; i < r->nr_ptrs; i++) {
			if (r->ptrs[i].dev == ca->dev_idx &&
			    sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->sectors_free = min(ja->sectors_free,
						       ca->mi.bucket_size - wrote);
			}
		}
	}
	mutex_unlock(&jlist->lock);

	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	unsigned i;

	for (i = 0; i < j->nr_ptrs; i++) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
		u64 offset;

		div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

		if (i)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
		       j->ptrs[i].dev,
		       j->ptrs[i].bucket,
		       j->ptrs[i].bucket_offset,
		       j->ptrs[i].sector);
	}
}

int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct bch_dev *ca;
	unsigned iter;
	struct printbuf buf = PRINTBUF;
	size_t keys = 0, entries = 0;
	bool degraded = false;
	u64 seq, last_seq = 0;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*start_seq = 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
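	/*
	 * E.g. if the newest flush entry is seq 100 and seqs 101-103 are
	 * non-flush entries, *start_seq becomes 104, *blacklist_seq
	 * becomes 101, and the non-flush entries are freed as the loop
	 * walks back to the flush:
	 */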
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		if (!*start_seq)
			*start_seq = le64_to_cpu(i->j.seq) + 1;

		if (!JSET_NO_FLUSH(&i->j)) {
			last_seq	= le64_to_cpu(i->j.last_seq);
			*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
			break;
		}

		journal_replay_free(c, i);
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!last_seq) {
		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
		ret = -1;
		goto err;
	}

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < last_seq) {
			journal_replay_free(c, i);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    "found blacklisted journal entry %llu", seq);

			journal_replay_free(c, i);
		}
	}

	/* Check for missing entries: */
	seq = last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 "  prev at %s\n"
				 "  next at %s",
				 missing_start, missing_end,
				 last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct jset_entry *entry;
		struct bkey_i *k, *_n;
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};
		unsigned ptr;

		i = *_i;
		if (!i || i->ignore)
			continue;

		ret = jset_validate_entries(c, &i->j, READ);
		if (ret)
			goto err;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

		bch2_replicas_entry_sort(&replicas.e);

		/*
		 * If we're mounting in degraded mode - if we didn't read all
		 * the devices - this is wrong:
		 */

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
				"superblock not marked as containing replicas %s",
				buf.buf)) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}

		for_each_jset_key(k, _n, entry, &i->j)
			keys++;
		entries++;
	}

	bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
		 keys, entries, *start_seq);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
					 ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}
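
/*
 * Note that *replicas above accumulates ca->mi.durability rather than a
 * plain count, so e.g. with replicas_want = 2 a single pointer to a
 * durability-2 device satisfies the replication goal on its own.
 */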
1309
1310 /**
1311  * journal_next_bucket - move on to the next journal bucket if possible
1312  */
1313 static int journal_write_alloc(struct journal *j, struct journal_buf *w,
1314                                unsigned sectors)
1315 {
1316         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1317         struct bch_devs_mask devs;
1318         struct journal_device *ja;
1319         struct bch_dev *ca;
1320         struct dev_alloc_list devs_sorted;
1321         unsigned target = c->opts.metadata_target ?:
1322                 c->opts.foreground_target;
1323         unsigned i, replicas = 0, replicas_want =
1324                 READ_ONCE(c->opts.metadata_replicas);
1325
1326         rcu_read_lock();
1327 retry:
1328         devs = target_rw_devs(c, BCH_DATA_journal, target);
1329
1330         devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
1331
1332         __journal_write_alloc(j, w, &devs_sorted,
1333                               sectors, &replicas, replicas_want);
1334
1335         if (replicas >= replicas_want)
1336                 goto done;
1337
1338         for (i = 0; i < devs_sorted.nr; i++) {
1339                 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
1340                 if (!ca)
1341                         continue;
1342
1343                 ja = &ca->journal;
1344
1345                 if (sectors > ja->sectors_free &&
1346                     sectors <= ca->mi.bucket_size &&
1347                     bch2_journal_dev_buckets_available(j, ja,
1348                                         journal_space_discarded)) {
1349                         ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
1350                         ja->sectors_free = ca->mi.bucket_size;
1351
1352                         /*
1353                          * ja->bucket_seq[ja->cur_idx] must always have
1354                          * something sensible:
1355                          */
1356                         ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
1357                 }
1358         }
1359
1360         __journal_write_alloc(j, w, &devs_sorted,
1361                               sectors, &replicas, replicas_want);
1362
1363         if (replicas < replicas_want && target) {
1364                 /* Retry from all devices: */
1365                 target = 0;
1366                 goto retry;
1367         }
1368 done:
1369         rcu_read_unlock();
1370
1371         BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
1372
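        /*
         * If we allocated fewer than metadata_replicas_required replicas,
         * fail the write with -EROFS; the caller treats this as a fatal
         * error:
         */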
1373         return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
1374 }
1375
1376 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
1377 {
1378         /* we aren't holding j->lock: */
1379         unsigned new_size = READ_ONCE(j->buf_size_want);
1380         void *new_buf;
1381
1382         if (buf->buf_size >= new_size)
1383                 return;
1384
1385         new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
1386         if (!new_buf)
1387                 return;
1388
1389         memcpy(new_buf, buf->data, buf->buf_size);
1390
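        /*
         * Publish the larger buffer under j->lock; after the swaps, new_buf
         * and new_size refer to the old allocation, which is freed outside
         * the lock:
         */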
1391         spin_lock(&j->lock);
1392         swap(buf->data,         new_buf);
1393         swap(buf->buf_size,     new_size);
1394         spin_unlock(&j->lock);
1395
1396         kvpfree(new_buf, new_size);
1397 }
1398
1399 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
1400 {
1401         return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
1402 }
1403
1404 static void journal_write_done(struct closure *cl)
1405 {
1406         struct journal *j = container_of(cl, struct journal, io);
1407         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1408         struct journal_buf *w = journal_last_unwritten_buf(j);
1409         struct bch_replicas_padded replicas;
1410         union journal_res_state old, new;
1411         u64 v, seq;
1412         int err = 0;
1413
1414         bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
1415                                ? j->flush_write_time
1416                                : j->noflush_write_time, j->write_start_time);
1417
1418         if (!w->devs_written.nr) {
1419                 bch_err(c, "unable to write journal to sufficient devices");
1420                 err = -EIO;
1421         } else {
1422                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
1423                                          w->devs_written);
1424                 if (bch2_mark_replicas(c, &replicas.e))
1425                         err = -EIO;
1426         }
1427
1428         if (err)
1429                 bch2_fatal_error(c);
1430
1431         spin_lock(&j->lock);
1432         seq = le64_to_cpu(w->data->seq);
1433
1434         if (seq >= j->pin.front)
1435                 journal_seq_pin(j, seq)->devs = w->devs_written;
1436
1437         if (!err) {
1438                 if (!JSET_NO_FLUSH(w->data)) {
1439                         j->flushed_seq_ondisk = seq;
1440                         j->last_seq_ondisk = w->last_seq;
1441
1442                         bch2_do_discards(c);
1443                         closure_wake_up(&c->freelist_wait);
1444                 }
1445         } else if (!j->err_seq || seq < j->err_seq)
1446                 j->err_seq      = seq;
1447
1448         j->seq_ondisk           = seq;
1449
1450         /*
1451          * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
1452          * more buckets:
1453          *
1454          * Must come before signaling write completion, for
1455          * bch2_fs_journal_stop():
1456          */
1457         if (j->watermark)
1458                 journal_reclaim_kick(&c->journal);
1459
1460         /* also must come before signaling write completion: */
1461         closure_debug_destroy(cl);
1462
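        /*
         * Lockless cmpxchg loop on the packed reservations word: bump
         * unwritten_idx to mark this journal buffer as no longer unwritten:
         */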
1463         v = atomic64_read(&j->reservations.counter);
1464         do {
1465                 old.v = new.v = v;
1466                 BUG_ON(journal_state_count(new, new.unwritten_idx));
1467
1468                 new.unwritten_idx++;
1469         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
1470                                        old.v, new.v)) != old.v);
1471
1472         bch2_journal_space_available(j);
1473
1474         closure_wake_up(&w->wait);
1475         journal_wake(j);
1476
1477         if (!journal_state_count(new, new.unwritten_idx) &&
1478             journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
1479                 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
1480         } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
1481                    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
1482                 struct journal_buf *buf = journal_cur_buf(j);
1483                 long delta = buf->expires - jiffies;
1484
1485                 /*
1486                  * We don't close a journal entry to write it while there are
1487                  * previous entries still in flight - the current journal entry
1488                  * might want to be written now:
1489                  */
1490
1491                 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
1492         }
1493
1494         spin_unlock(&j->lock);
1495 }
1496
1497 static void journal_write_endio(struct bio *bio)
1498 {
1499         struct bch_dev *ca = bio->bi_private;
1500         struct journal *j = &ca->fs->journal;
1501         struct journal_buf *w = journal_last_unwritten_buf(j);
1502         unsigned long flags;
1503
1504         if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
1505                                le64_to_cpu(w->data->seq),
1506                                bch2_blk_status_to_str(bio->bi_status)) ||
1507             bch2_meta_write_fault("journal")) {
1508                 spin_lock_irqsave(&j->err_lock, flags);
1509                 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
1510                 spin_unlock_irqrestore(&j->err_lock, flags);
1511         }
1512
1513         closure_put(&j->io);
1514         percpu_ref_put(&ca->io_ref);
1515 }
1516
1517 static void do_journal_write(struct closure *cl)
1518 {
1519         struct journal *j = container_of(cl, struct journal, io);
1520         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1521         struct bch_dev *ca;
1522         struct journal_buf *w = journal_last_unwritten_buf(j);
1523         struct bch_extent_ptr *ptr;
1524         struct bio *bio;
1525         unsigned sectors = vstruct_sectors(w->data, c->block_bits);
1526
1527         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1528                 ca = bch_dev_bkey_exists(c, ptr->dev);
1529                 if (!percpu_ref_tryget(&ca->io_ref)) {
1530                         /* XXX: fix this */
1531                         bch_err(c, "missing device for journal write");
1532                         continue;
1533                 }
1534
1535                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1536                              sectors);
1537
1538                 bio = ca->journal.bio;
1539                 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
1540                 bio->bi_iter.bi_sector  = ptr->offset;
1541                 bio->bi_end_io          = journal_write_endio;
1542                 bio->bi_private         = ca;
1543
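                /* Catch any attempt to write the same sector twice in a row: */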
1544                 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
1545                 ca->prev_journal_sector = bio->bi_iter.bi_sector;
1546
1547                 if (!JSET_NO_FLUSH(w->data))
1548                         bio->bi_opf    |= REQ_FUA;
1549                 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
1550                         bio->bi_opf    |= REQ_PREFLUSH;
1551
1552                 bch2_bio_map(bio, w->data, sectors << 9);
1553
1554                 trace_journal_write(bio);
1555                 closure_bio_submit(bio, cl);
1556
1557                 ca->journal.bucket_seq[ca->journal.cur_idx] =
1558                         le64_to_cpu(w->data->seq);
1559         }
1560
1561         continue_at(cl, journal_write_done, c->io_complete_wq);
1562         return;
1563 }
1564
1565 void bch2_journal_write(struct closure *cl)
1566 {
1567         struct journal *j = container_of(cl, struct journal, io);
1568         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1569         struct bch_dev *ca;
1570         struct journal_buf *w = journal_last_unwritten_buf(j);
1571         struct jset_entry *start, *end;
1572         struct jset *jset;
1573         struct bio *bio;
1574         struct printbuf journal_debug_buf = PRINTBUF;
1575         bool validate_before_checksum = false;
1576         unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
1577         int ret;
1578
1579         BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
1580
1581         journal_buf_realloc(j, w);
1582         jset = w->data;
1583
1584         j->write_start_time = local_clock();
1585
1586         spin_lock(&j->lock);
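        /*
         * Decide whether this write can skip the flush: the journal is in an
         * error state, the buffer was already marked no-flush, or no flush
         * was demanded and we flushed recently enough (within
         * journal_flush_delay, with JOURNAL_MAY_SKIP_FLUSH set):
         */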
1587         if (bch2_journal_error(j) ||
1588             w->noflush ||
1589             (!w->must_flush &&
1590              (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
1591              test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
1592                 w->noflush = true;
1593                 SET_JSET_NO_FLUSH(jset, true);
1594                 jset->last_seq  = 0;
1595                 w->last_seq     = 0;
1596
1597                 j->nr_noflush_writes++;
1598         } else {
1599                 j->last_flush_write = jiffies;
1600                 j->nr_flush_writes++;
1601         }
1602         spin_unlock(&j->lock);
1603
1604         /*
1605          * New btree roots are set by journalling them; when the journal entry
1606          * gets written we have to propagate them to c->btree_roots
1607          *
1608          * But every journal entry we write has to contain all the btree roots
1609          * (at least for now); so after copying btree roots to c->btree_roots we
1610          * have to add any missing btree roots back to this journal
1611          * entry:
1612          */
1613
1614         bch2_journal_entries_to_btree_roots(c, jset);
1615
1616         start = end = vstruct_last(jset);
1617
1618         end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
1619
1620         bch2_journal_super_entries_add_common(c, &end,
1621                                 le64_to_cpu(jset->seq));
1622         u64s    = (u64 *) end - (u64 *) start;
1623         BUG_ON(u64s > j->entry_u64s_reserved);
1624
1625         le32_add_cpu(&jset->u64s, u64s);
1626         BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
1627
1628         jset->magic             = cpu_to_le64(jset_magic(c));
1629         jset->version           = c->sb.version < bcachefs_metadata_version_bkey_renumber
1630                 ? cpu_to_le32(BCH_JSET_VERSION_OLD)
1631                 : cpu_to_le32(c->sb.version);
1632
1633         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1634         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1635
1636         if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
1637                 j->last_empty_seq = le64_to_cpu(jset->seq);
1638
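        /*
         * With an encryption-type checksum the payload is encrypted in place
         * below, so the entry must be validated while it's still plaintext;
         * entries written in an older on-disk format are likewise validated
         * before the checksum is taken:
         */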
1639         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1640                 validate_before_checksum = true;
1641
1642         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
1643                 validate_before_checksum = true;
1644
1645         if (validate_before_checksum &&
1646             jset_validate_for_write(c, jset))
1647                 goto err;
1648
1649         ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1650                     jset->encrypted_start,
1651                     vstruct_end(jset) - (void *) jset->encrypted_start);
1652         if (bch2_fs_fatal_err_on(ret, c,
1653                         "error encrypting journal entry: %i", ret))
1654                 goto err;
1655
1656         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1657                                   journal_nonce(jset), jset);
1658
1659         if (!validate_before_checksum &&
1660             jset_validate_for_write(c, jset))
1661                 goto err;
1662
1663         sectors = vstruct_sectors(jset, c->block_bits);
1664         BUG_ON(sectors > w->sectors);
1665
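        /*
         * Zero the tail of the last sector, from the end of the journal
         * entry to the sector boundary, so no uninitialized memory hits
         * the disk:
         */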
1666         bytes = vstruct_bytes(jset);
1667         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1668
1669 retry_alloc:
1670         spin_lock(&j->lock);
1671         ret = journal_write_alloc(j, w, sectors);
1672
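        /*
         * If allocation failed but there are buckets awaiting discard,
         * discard them and retry:
         */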
1673         if (ret && j->can_discard) {
1674                 spin_unlock(&j->lock);
1675                 bch2_journal_do_discards(j);
1676                 goto retry_alloc;
1677         }
1678
1679         if (ret)
1680                 __bch2_journal_debug_to_text(&journal_debug_buf, j);
1681
1682         /*
1683          * write is allocated, no longer need to account for it in
1684          * bch2_journal_space_available():
1685          */
1686         w->sectors = 0;
1687
1688         /*
1689          * journal entry has been compacted and allocated, recalculate space
1690          * available:
1691          */
1692         bch2_journal_space_available(j);
1693         spin_unlock(&j->lock);
1694
1695         if (ret) {
1696                 bch_err(c, "unable to allocate journal write:\n%s",
1697                         journal_debug_buf.buf);
1698                 printbuf_exit(&journal_debug_buf);
1699                 bch2_fatal_error(c);
1700                 continue_at(cl, journal_write_done, c->io_complete_wq);
1701                 return;
1702         }
1703
1704         w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
1705
1706         if (c->opts.nochanges)
1707                 goto no_io;
1708
1709         for_each_rw_member(ca, c, i)
1710                 nr_rw_members++;
1711
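        /*
         * A flush write must flush the write caches of every rw device, not
         * just the ones this journal entry is going to; with more than one
         * rw member, issue separate flushes ahead of the write instead of
         * relying on REQ_PREFLUSH on the journal write itself:
         */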
1712         if (nr_rw_members > 1)
1713                 w->separate_flush = true;
1714
1715         if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
1716                 for_each_rw_member(ca, c, i) {
1717                         percpu_ref_get(&ca->io_ref);
1718
1719                         bio = ca->journal.bio;
1720                         bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
1721                         bio->bi_end_io          = journal_write_endio;
1722                         bio->bi_private         = ca;
1723                         closure_bio_submit(bio, cl);
1724                 }
1725         }
1726
1727         continue_at(cl, do_journal_write, c->io_complete_wq);
1728         return;
1729 no_io:
1730         continue_at(cl, journal_write_done, c->io_complete_wq);
1731         return;
1732 err:
1733         bch2_fatal_error(c);
1734         continue_at(cl, journal_write_done, c->io_complete_wq);
1735 }