1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_foreground.h"
4 #include "btree_io.h"
5 #include "btree_update_interior.h"
6 #include "buckets.h"
7 #include "checksum.h"
8 #include "error.h"
9 #include "io.h"
10 #include "journal.h"
11 #include "journal_io.h"
12 #include "journal_reclaim.h"
13 #include "replicas.h"
14
15 #include <trace/events/bcachefs.h>
16
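/*
 * Scratch state shared by the per-device journal read closures: the ordered
 * list of journal entries found so far, the lock protecting it, and the
 * first error encountered (if any).
 */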
17 struct journal_list {
18         struct closure          cl;
19         struct mutex            lock;
20         struct list_head        *head;
21         int                     ret;
22 };
23
24 #define JOURNAL_ENTRY_ADD_OK            0
25 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE  5
26
27 /*
28  * Given a journal entry we just read, add it to the list of journal entries to
29  * be replayed:
30  */
31 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
32                              struct journal_list *jlist, struct jset *j,
33                              bool bad)
34 {
35         struct journal_replay *i, *pos;
36         struct bch_devs_list devs = { .nr = 0 };
37         struct list_head *where;
38         size_t bytes = vstruct_bytes(j);
39         __le64 last_seq;
40         int ret;
41
42         last_seq = !list_empty(jlist->head)
43                 ? list_last_entry(jlist->head, struct journal_replay,
44                                   list)->j.last_seq
45                 : 0;
46
47         if (!c->opts.read_entire_journal) {
48                 /* Is this entry older than the range we need? */
49                 if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
50                         ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
51                         goto out;
52                 }
53
54                 /* Drop entries we don't need anymore */
55                 list_for_each_entry_safe(i, pos, jlist->head, list) {
56                         if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
57                                 break;
58                         list_del(&i->list);
59                         kvpfree(i, offsetof(struct journal_replay, j) +
60                                 vstruct_bytes(&i->j));
61                 }
62         }
63
64         list_for_each_entry_reverse(i, jlist->head, list) {
65                 if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
66                         where = &i->list;
67                         goto add;
68                 }
69         }
70
71         where = jlist->head;
72 add:
73         i = where->next != jlist->head
74                 ? container_of(where->next, struct journal_replay, list)
75                 : NULL;
76
77         /*
78          * Duplicate journal entries? If so we want the one that didn't have a
79          * checksum error:
80          */
81         if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
82                 if (i->bad) {
83                         devs = i->devs;
84                         list_del(&i->list);
85                         kvpfree(i, offsetof(struct journal_replay, j) +
86                                 vstruct_bytes(&i->j));
87                 } else if (bad) {
88                         goto found;
89                 } else {
90                         fsck_err_on(bytes != vstruct_bytes(&i->j) ||
91                                     memcmp(j, &i->j, bytes), c,
92                                     "found duplicate but non-identical journal entries (seq %llu)",
93                                     le64_to_cpu(j->seq));
94                         goto found;
95                 }
96
97         }
98
99         i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
100         if (!i) {
101                 ret = -ENOMEM;
102                 goto out;
103         }
104
105         list_add(&i->list, where);
106         i->devs = devs;
107         i->bad  = bad;
108         memcpy(&i->j, j, bytes);
109 found:
110         if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
111                 bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
112         else
113                 fsck_err_on(1, c, "duplicate journal entries on same device");
114         ret = JOURNAL_ENTRY_ADD_OK;
115 out:
116 fsck_err:
117         return ret;
118 }
119
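/*
 * The journal nonce is derived from the entry's sequence number plus the
 * BCH_NONCE_JOURNAL type tag, so each journal entry is checksummed/encrypted
 * with a distinct nonce without storing one explicitly.
 */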
120 static struct nonce journal_nonce(const struct jset *jset)
121 {
122         return (struct nonce) {{
123                 [0] = 0,
124                 [1] = ((__le32 *) &jset->seq)[0],
125                 [2] = ((__le32 *) &jset->seq)[1],
126                 [3] = BCH_NONCE_JOURNAL,
127         }};
128 }
129
130 /* this fills in a range with empty jset_entries: */
131 static void journal_entry_null_range(void *start, void *end)
132 {
133         struct jset_entry *entry;
134
135         for (entry = start; entry != end; entry = vstruct_next(entry))
136                 memset(entry, 0, sizeof(*entry));
137 }
138
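/*
 * Positive, internal return codes from jset_validate()/journal_read_bucket():
 * REREAD means the entry extends past what we've read so far (retry with a
 * bigger buffer), NONE means no valid journal magic was found (end of journal
 * data in this bucket), BAD means size or checksum validation failed.
 */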
139 #define JOURNAL_ENTRY_REREAD    5
140 #define JOURNAL_ENTRY_NONE      6
141 #define JOURNAL_ENTRY_BAD       7
142
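/*
 * journal_entry_err()/journal_entry_err_on() expect 'write', 'ret' and an
 * fsck_err label in the enclosing scope: at read time a validation failure is
 * a fixable fsck error, at write time it means we're about to write out
 * corrupt metadata. Typical usage (taken from the validators below):
 *
 *	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
 *			"invalid journal seq blacklist entry: bad size"))
 *		journal_entry_null_range(entry, vstruct_next(entry));
 */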
143 #define journal_entry_err(c, msg, ...)                                  \
144 ({                                                                      \
145         switch (write) {                                                \
146         case READ:                                                      \
147                 mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
148                 break;                                                  \
149         case WRITE:                                                     \
150                 bch_err(c, "corrupt metadata before write:\n"           \
151                         msg, ##__VA_ARGS__);                            \
152                 if (bch2_fs_inconsistent(c)) {                          \
153                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
154                         goto fsck_err;                                  \
155                 }                                                       \
156                 break;                                                  \
157         }                                                               \
158         true;                                                           \
159 })
160
161 #define journal_entry_err_on(cond, c, msg, ...)                         \
162         ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
163
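/*
 * Returned by journal_validate_key() when the offending key was dropped or
 * nulled out; the caller should revalidate from the same position instead of
 * advancing to the next key.
 */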
164 #define FSCK_DELETED_KEY        5
165
166 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
167                                 struct jset_entry *entry,
168                                 unsigned level, enum btree_id btree_id,
169                                 struct bkey_i *k,
170                                 const char *type, int write)
171 {
172         void *next = vstruct_next(entry);
173         const char *invalid;
174         unsigned version = le32_to_cpu(jset->version);
175         int ret = 0;
176
177         if (journal_entry_err_on(!k->k.u64s, c,
178                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
179                         type, le64_to_cpu(jset->seq),
180                         (u64 *) entry - jset->_data,
181                         le32_to_cpu(jset->u64s),
182                         (u64 *) k - entry->_data,
183                         le16_to_cpu(entry->u64s))) {
184                 entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
185                 journal_entry_null_range(vstruct_next(entry), next);
186                 return FSCK_DELETED_KEY;
187         }
188
189         if (journal_entry_err_on((void *) bkey_next(k) >
190                                 (void *) vstruct_next(entry), c,
191                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
192                         type, le64_to_cpu(jset->seq),
193                         (u64 *) entry - jset->_data,
194                         le32_to_cpu(jset->u64s),
195                         (u64 *) k - entry->_data,
196                         le16_to_cpu(entry->u64s))) {
197                 entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
198                 journal_entry_null_range(vstruct_next(entry), next);
199                 return FSCK_DELETED_KEY;
200         }
201
202         if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
203                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
204                         type, le64_to_cpu(jset->seq),
205                         (u64 *) entry - jset->_data,
206                         le32_to_cpu(jset->u64s),
207                         (u64 *) k - entry->_data,
208                         le16_to_cpu(entry->u64s),
209                         k->k.format)) {
210                 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
211                 memmove(k, bkey_next(k), next - (void *) bkey_next(k));
212                 journal_entry_null_range(vstruct_next(entry), next);
213                 return FSCK_DELETED_KEY;
214         }
215
216         if (!write)
217                 bch2_bkey_compat(level, btree_id, version,
218                             JSET_BIG_ENDIAN(jset), write,
219                             NULL, bkey_to_packed(k));
220
221         invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
222                                     __btree_node_type(level, btree_id));
223         if (invalid) {
224                 char buf[160];
225
226                 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
227                 mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
228                                  type, le64_to_cpu(jset->seq),
229                                  (u64 *) entry - jset->_data,
230                                  le32_to_cpu(jset->u64s),
231                                  (u64 *) k - entry->_data,
232                                  le16_to_cpu(entry->u64s),
233                                  invalid, buf);
234
235                 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
236                 memmove(k, bkey_next(k), next - (void *) bkey_next(k));
237                 journal_entry_null_range(vstruct_next(entry), next);
238                 return FSCK_DELETED_KEY;
239         }
240
241         if (write)
242                 bch2_bkey_compat(level, btree_id, version,
243                             JSET_BIG_ENDIAN(jset), write,
244                             NULL, bkey_to_packed(k));
245 fsck_err:
246         return ret;
247 }
248
249 static int journal_entry_validate_btree_keys(struct bch_fs *c,
250                                              struct jset *jset,
251                                              struct jset_entry *entry,
252                                              int write)
253 {
254         struct bkey_i *k = entry->start;
255
256         while (k != vstruct_last(entry)) {
257                 int ret = journal_validate_key(c, jset, entry,
258                                                entry->level,
259                                                entry->btree_id,
260                                                k, "key", write);
261                 if (ret == FSCK_DELETED_KEY)
262                         continue;
263
264                 k = bkey_next(k);
265         }
266
267         return 0;
268 }
269
270 static int journal_entry_validate_btree_root(struct bch_fs *c,
271                                              struct jset *jset,
272                                              struct jset_entry *entry,
273                                              int write)
274 {
275         struct bkey_i *k = entry->start;
276         int ret = 0;
277
278         if (journal_entry_err_on(!entry->u64s ||
279                                  le16_to_cpu(entry->u64s) != k->k.u64s, c,
280                                  "invalid btree root journal entry: wrong number of keys")) {
281                 void *next = vstruct_next(entry);
282                 /*
283                  * we don't want to null out this jset_entry,
284                  * just the contents, so that later we can tell
285                  * we were _supposed_ to have a btree root
286                  */
287                 entry->u64s = 0;
288                 journal_entry_null_range(vstruct_next(entry), next);
289                 return 0;
290         }
291
292         return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
293                                     "btree root", write);
294 fsck_err:
295         return ret;
296 }
297
298 static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
299                                             struct jset *jset,
300                                             struct jset_entry *entry,
301                                             int write)
302 {
303         /* obsolete, don't care: */
304         return 0;
305 }
306
307 static int journal_entry_validate_blacklist(struct bch_fs *c,
308                                             struct jset *jset,
309                                             struct jset_entry *entry,
310                                             int write)
311 {
312         int ret = 0;
313
314         if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
315                 "invalid journal seq blacklist entry: bad size")) {
316                 journal_entry_null_range(entry, vstruct_next(entry));
317         }
318 fsck_err:
319         return ret;
320 }
321
322 static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
323                                                struct jset *jset,
324                                                struct jset_entry *entry,
325                                                int write)
326 {
327         struct jset_entry_blacklist_v2 *bl_entry;
328         int ret = 0;
329
330         if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
331                 "invalid journal seq blacklist entry: bad size")) {
332                 journal_entry_null_range(entry, vstruct_next(entry));
333                 goto out;
334         }
335
336         bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
337
338         if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
339                                  le64_to_cpu(bl_entry->end), c,
340                 "invalid journal seq blacklist entry: start > end")) {
341                 journal_entry_null_range(entry, vstruct_next(entry));
342         }
343 out:
344 fsck_err:
345         return ret;
346 }
347
348 static int journal_entry_validate_usage(struct bch_fs *c,
349                                         struct jset *jset,
350                                         struct jset_entry *entry,
351                                         int write)
352 {
353         struct jset_entry_usage *u =
354                 container_of(entry, struct jset_entry_usage, entry);
355         unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
356         int ret = 0;
357
358         if (journal_entry_err_on(bytes < sizeof(*u),
359                                  c,
360                                  "invalid journal entry usage: bad size")) {
361                 journal_entry_null_range(entry, vstruct_next(entry));
362                 return ret;
363         }
364
365 fsck_err:
366         return ret;
367 }
368
369 static int journal_entry_validate_data_usage(struct bch_fs *c,
370                                         struct jset *jset,
371                                         struct jset_entry *entry,
372                                         int write)
373 {
374         struct jset_entry_data_usage *u =
375                 container_of(entry, struct jset_entry_data_usage, entry);
376         unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
377         int ret = 0;
378
379         if (journal_entry_err_on(bytes < sizeof(*u) ||
380                                  bytes < sizeof(*u) + u->r.nr_devs,
381                                  c,
382                                  "invalid journal entry usage: bad size")) {
383                 journal_entry_null_range(entry, vstruct_next(entry));
384                 return ret;
385         }
386
387 fsck_err:
388         return ret;
389 }
390
391 struct jset_entry_ops {
392         int (*validate)(struct bch_fs *, struct jset *,
393                         struct jset_entry *, int);
394 };
395
396 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
397 #define x(f, nr)                                                \
398         [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
399                 .validate       = journal_entry_validate_##f,   \
400         },
401         BCH_JSET_ENTRY_TYPES()
402 #undef x
403 };
404
405 static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
406                                   struct jset_entry *entry, int write)
407 {
408         return entry->type < BCH_JSET_ENTRY_NR
409                 ? bch2_jset_entry_ops[entry->type].validate(c, jset,
410                                                             entry, write)
411                 : 0;
412 }
413
414 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
415                                  int write)
416 {
417         struct jset_entry *entry;
418         int ret = 0;
419
420         vstruct_for_each(jset, entry) {
421                 if (journal_entry_err_on(vstruct_next(entry) >
422                                          vstruct_last(jset), c,
423                                 "journal entry extends past end of jset")) {
424                         jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
425                         break;
426                 }
427
428                 ret = journal_entry_validate(c, jset, entry, write);
429                 if (ret)
430                         break;
431         }
432 fsck_err:
433         return ret;
434 }
435
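/*
 * Validate a single on-disk journal entry (jset): check the magic and
 * version, that it fits within what we've read so far and within the bucket,
 * verify the checksum, decrypt the payload, and sanity check
 * last_seq <= seq.
 */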
436 static int jset_validate(struct bch_fs *c,
437                          struct bch_dev *ca,
438                          struct jset *jset, u64 sector,
439                          unsigned bucket_sectors_left,
440                          unsigned sectors_read,
441                          int write)
442 {
443         size_t bytes = vstruct_bytes(jset);
444         struct bch_csum csum;
445         unsigned version;
446         int ret = 0;
447
448         if (le64_to_cpu(jset->magic) != jset_magic(c))
449                 return JOURNAL_ENTRY_NONE;
450
451         version = le32_to_cpu(jset->version);
452         if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
453                                   version < bcachefs_metadata_version_min) ||
454                                  version >= bcachefs_metadata_version_max, c,
455                         "%s sector %llu seq %llu: unknown journal entry version %u",
456                         ca->name, sector, le64_to_cpu(jset->seq),
457                         version)) {
458                 /* don't try to continue: */
459                 return EINVAL;
460         }
461
462         if (bytes > (sectors_read << 9) &&
463             sectors_read < bucket_sectors_left)
464                 return JOURNAL_ENTRY_REREAD;
465
466         if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
467                         "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
468                         ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
469                 ret = JOURNAL_ENTRY_BAD;
470                 le32_add_cpu(&jset->u64s,
471                              -((bytes - (bucket_sectors_left << 9)) / 8));
472         }
473
474         if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
475                         "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
476                         ca->name, sector, le64_to_cpu(jset->seq),
477                         JSET_CSUM_TYPE(jset))) {
478                 ret = JOURNAL_ENTRY_BAD;
479                 goto bad_csum_type;
480         }
481
482         csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
483         if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
484                                  "%s sector %llu seq %llu: journal checksum bad",
485                                  ca->name, sector, le64_to_cpu(jset->seq)))
486                 ret = JOURNAL_ENTRY_BAD;
487
488         bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
489                      jset->encrypted_start,
490                      vstruct_end(jset) - (void *) jset->encrypted_start);
491 bad_csum_type:
492         if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
493                                  "invalid journal entry: last_seq > seq")) {
494                 jset->last_seq = jset->seq;
495                 return JOURNAL_ENTRY_BAD;
496         }
497 fsck_err:
498         return ret;
499 }
500
501 struct journal_read_buf {
502         void            *data;
503         size_t          size;
504 };
505
506 static int journal_read_buf_realloc(struct journal_read_buf *b,
507                                     size_t new_size)
508 {
509         void *n;
510
511         /* the bios are sized for this many pages, max: */
512         if (new_size > JOURNAL_ENTRY_SIZE_MAX)
513                 return -ENOMEM;
514
515         new_size = roundup_pow_of_two(new_size);
516         n = kvpmalloc(new_size, GFP_KERNEL);
517         if (!n)
518                 return -ENOMEM;
519
520         kvpfree(b->data, b->size);
521         b->data = n;
522         b->size = new_size;
523         return 0;
524 }
525
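/*
 * Scan one journal bucket: read it in buffer-sized chunks, validate each jset
 * we find and hand the usable ones to journal_entry_add(). On a checksum
 * error we don't trust the entry's size field, so scanning resumes at the
 * next block boundary.
 */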
526 static int journal_read_bucket(struct bch_dev *ca,
527                                struct journal_read_buf *buf,
528                                struct journal_list *jlist,
529                                unsigned bucket)
530 {
531         struct bch_fs *c = ca->fs;
532         struct journal_device *ja = &ca->journal;
533         struct jset *j = NULL;
534         unsigned sectors, sectors_read = 0;
535         u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
536             end = offset + ca->mi.bucket_size;
537         bool saw_bad = false;
538         int ret = 0;
539
540         pr_debug("reading %u", bucket);
541
542         while (offset < end) {
543                 if (!sectors_read) {
544                         struct bio *bio;
545 reread:
546                         sectors_read = min_t(unsigned,
547                                 end - offset, buf->size >> 9);
548
549                         bio = bio_kmalloc(GFP_KERNEL,
550                                           buf_pages(buf->data,
551                                                     sectors_read << 9));
552                         bio_set_dev(bio, ca->disk_sb.bdev);
553                         bio->bi_iter.bi_sector  = offset;
554                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
555                         bch2_bio_map(bio, buf->data, sectors_read << 9);
556
557                         ret = submit_bio_wait(bio);
558                         bio_put(bio);
559
560                         if (bch2_dev_io_err_on(ret, ca,
561                                                "journal read from sector %llu",
562                                                offset) ||
563                             bch2_meta_read_fault("journal"))
564                                 return -EIO;
565
566                         j = buf->data;
567                 }
568
569                 ret = jset_validate(c, ca, j, offset,
570                                     end - offset, sectors_read,
571                                     READ);
572                 switch (ret) {
573                 case BCH_FSCK_OK:
574                         sectors = vstruct_sectors(j, c->block_bits);
575                         break;
576                 case JOURNAL_ENTRY_REREAD:
577                         if (vstruct_bytes(j) > buf->size) {
578                                 ret = journal_read_buf_realloc(buf,
579                                                         vstruct_bytes(j));
580                                 if (ret)
581                                         return ret;
582                         }
583                         goto reread;
584                 case JOURNAL_ENTRY_NONE:
585                         if (!saw_bad)
586                                 return 0;
587                         sectors = c->opts.block_size;
588                         goto next_block;
589                 case JOURNAL_ENTRY_BAD:
590                         saw_bad = true;
591                         /*
592                          * On checksum error we don't really trust the size
593                          * field of the journal entry we read, so try reading
594                          * again at next block boundary:
595                          */
596                         sectors = c->opts.block_size;
597                         break;
598                 default:
599                         return ret;
600                 }
601
602                 /*
603                  * This happens sometimes if we don't have discards on -
604                  * when we've partially overwritten a bucket with new
605                  * journal entries. We don't need the rest of the
606                  * bucket:
607                  */
608                 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
609                         return 0;
610
611                 ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
612
613                 mutex_lock(&jlist->lock);
614                 ret = journal_entry_add(c, ca, jlist, j, ret != 0);
615                 mutex_unlock(&jlist->lock);
616
617                 switch (ret) {
618                 case JOURNAL_ENTRY_ADD_OK:
619                         break;
620                 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
621                         break;
622                 default:
623                         return ret;
624                 }
625 next_block:
626                 pr_debug("next");
627                 offset          += sectors;
628                 sectors_read    -= sectors;
629                 j = ((void *) j) + (sectors << 9);
630         }
631
632         return 0;
633 }
634
635 static void bch2_journal_read_device(struct closure *cl)
636 {
637         struct journal_device *ja =
638                 container_of(cl, struct journal_device, read);
639         struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
640         struct journal_list *jlist =
641                 container_of(cl->parent, struct journal_list, cl);
642         struct journal_read_buf buf = { NULL, 0 };
643         u64 min_seq = U64_MAX;
644         unsigned i;
645         int ret;
646
647         if (!ja->nr)
648                 goto out;
649
650         ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
651         if (ret)
652                 goto err;
653
654         pr_debug("%u journal buckets", ja->nr);
655
656         for (i = 0; i < ja->nr; i++) {
657                 ret = journal_read_bucket(ca, &buf, jlist, i);
658                 if (ret)
659                         goto err;
660         }
661
662         /* Find the journal bucket with the highest sequence number: */
663         for (i = 0; i < ja->nr; i++) {
664                 if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
665                         ja->cur_idx = i;
666
667                 min_seq = min(ja->bucket_seq[i], min_seq);
668         }
669
670         /*
671          * If there are duplicate journal entries in multiple buckets (which
672          * definitely isn't supposed to happen, but...) - make sure to start
673          * cur_idx at the last of those buckets, so we don't deadlock trying to
674          * allocate
675          */
676         while (ja->bucket_seq[ja->cur_idx] > min_seq &&
677                ja->bucket_seq[ja->cur_idx] >
678                ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
679                 ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
680
681         ja->sectors_free = 0;
682
683         /*
684          * Set dirty_idx to indicate the entire journal is full and needs to be
685          * reclaimed - journal reclaim will immediately reclaim whatever isn't
686          * pinned when it first runs:
687          */
688         ja->discard_idx = ja->dirty_idx_ondisk =
689                 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
690 out:
691         kvpfree(buf.data, buf.size);
692         percpu_ref_put(&ca->io_ref);
693         closure_return(cl);
694         return;
695 err:
696         mutex_lock(&jlist->lock);
697         jlist->ret = ret;
698         mutex_unlock(&jlist->lock);
699         goto out;
700 }
701
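/*
 * Read the journal from every member device that may contain journal data,
 * in parallel, then validate the combined list of entries and make sure the
 * superblock is marked with the journal replicas we actually found.
 */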
702 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
703 {
704         struct journal_list jlist;
705         struct journal_replay *i;
706         struct bch_dev *ca;
707         unsigned iter;
708         size_t keys = 0, entries = 0;
709         bool degraded = false;
710         int ret = 0;
711
712         closure_init_stack(&jlist.cl);
713         mutex_init(&jlist.lock);
714         jlist.head = list;
715         jlist.ret = 0;
716
717         for_each_member_device(ca, c, iter) {
718                 if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
719                     !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
720                         continue;
721
722                 if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
723                      ca->mi.state == BCH_MEMBER_STATE_RO) &&
724                     percpu_ref_tryget(&ca->io_ref))
725                         closure_call(&ca->journal.read,
726                                      bch2_journal_read_device,
727                                      system_unbound_wq,
728                                      &jlist.cl);
729                 else
730                         degraded = true;
731         }
732
733         closure_sync(&jlist.cl);
734
735         if (jlist.ret)
736                 return jlist.ret;
737
738         list_for_each_entry(i, list, list) {
739                 struct jset_entry *entry;
740                 struct bkey_i *k, *_n;
741                 struct bch_replicas_padded replicas;
742                 char buf[80];
743
744                 ret = jset_validate_entries(c, &i->j, READ);
745                 if (ret)
746                         goto fsck_err;
747
748                 /*
749                  * If we're mounting in degraded mode - if we didn't read all
750                  * the devices - this is wrong:
751                  */
752
753                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
754
755                 if (!degraded &&
756                     (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
757                      fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
758                                  "superblock not marked as containing replicas %s",
759                                  (bch2_replicas_entry_to_text(&PBUF(buf),
760                                                               &replicas.e), buf)))) {
761                         ret = bch2_mark_replicas(c, &replicas.e);
762                         if (ret)
763                                 return ret;
764                 }
765
766                 for_each_jset_key(k, _n, entry, &i->j)
767                         keys++;
768                 entries++;
769         }
770
771         if (!list_empty(list)) {
772                 i = list_last_entry(list, struct journal_replay, list);
773
774                 bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
775                          keys, entries, le64_to_cpu(i->j.seq));
776         }
777 fsck_err:
778         return ret;
779 }
780
781 /* journal write: */
782
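/*
 * Add pointers to @w->key for up to @replicas_want copies of the journal
 * write, only using devices that have non-zero durability, are RW, have
 * journal buckets, aren't already in the key, and have @sectors free in
 * their current journal bucket.
 */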
783 static void __journal_write_alloc(struct journal *j,
784                                   struct journal_buf *w,
785                                   struct dev_alloc_list *devs_sorted,
786                                   unsigned sectors,
787                                   unsigned *replicas,
788                                   unsigned replicas_want)
789 {
790         struct bch_fs *c = container_of(j, struct bch_fs, journal);
791         struct journal_device *ja;
792         struct bch_dev *ca;
793         unsigned i;
794
795         if (*replicas >= replicas_want)
796                 return;
797
798         for (i = 0; i < devs_sorted->nr; i++) {
799                 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
800                 if (!ca)
801                         continue;
802
803                 ja = &ca->journal;
804
805                 /*
806                  * Check that we can use this device, and aren't already using
807                  * it:
808                  */
809                 if (!ca->mi.durability ||
810                     ca->mi.state != BCH_MEMBER_STATE_RW ||
811                     !ja->nr ||
812                     bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
813                                          ca->dev_idx) ||
814                     sectors > ja->sectors_free)
815                         continue;
816
817                 bch2_dev_stripe_increment(ca, &j->wp.stripe);
818
819                 bch2_bkey_append_ptr(&w->key,
820                         (struct bch_extent_ptr) {
821                                   .offset = bucket_to_sector(ca,
822                                         ja->buckets[ja->cur_idx]) +
823                                         ca->mi.bucket_size -
824                                         ja->sectors_free,
825                                   .dev = ca->dev_idx,
826                 });
827
828                 ja->sectors_free -= sectors;
829                 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
830
831                 *replicas += ca->mi.durability;
832
833                 if (*replicas >= replicas_want)
834                         break;
835         }
836 }
837
838 /**
839  * journal_write_alloc - allocate journal space, moving on to the next journal bucket if possible
840  */
841 static int journal_write_alloc(struct journal *j, struct journal_buf *w,
842                                unsigned sectors)
843 {
844         struct bch_fs *c = container_of(j, struct bch_fs, journal);
845         struct journal_device *ja;
846         struct bch_dev *ca;
847         struct dev_alloc_list devs_sorted;
848         unsigned i, replicas = 0, replicas_want =
849                 READ_ONCE(c->opts.metadata_replicas);
850
851         rcu_read_lock();
852
853         devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
854                                           &c->rw_devs[BCH_DATA_journal]);
855
856         __journal_write_alloc(j, w, &devs_sorted,
857                               sectors, &replicas, replicas_want);
858
859         if (replicas >= replicas_want)
860                 goto done;
861
862         for (i = 0; i < devs_sorted.nr; i++) {
863                 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
864                 if (!ca)
865                         continue;
866
867                 ja = &ca->journal;
868
869                 if (sectors > ja->sectors_free &&
870                     sectors <= ca->mi.bucket_size &&
871                     bch2_journal_dev_buckets_available(j, ja,
872                                         journal_space_discarded)) {
873                         ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
874                         ja->sectors_free = ca->mi.bucket_size;
875
876                         /*
877                          * ja->bucket_seq[ja->cur_idx] must always have
878                          * something sensible:
879                          */
880                         ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
881                 }
882         }
883
884         __journal_write_alloc(j, w, &devs_sorted,
885                               sectors, &replicas, replicas_want);
886 done:
887         rcu_read_unlock();
888
889         return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
890 }
891
892 static void journal_write_compact(struct jset *jset)
893 {
894         struct jset_entry *i, *next, *prev = NULL;
895
896         /*
897          * Simple compaction, dropping empty jset_entries (from journal
898          * reservations that weren't fully used) and merging jset_entries that
899          * can be.
900          *
901          * If we wanted to be really fancy here, we could sort all the keys in
902          * the jset and drop keys that were overwritten - probably not worth it:
903          */
904         vstruct_for_each_safe(jset, i, next) {
905                 unsigned u64s = le16_to_cpu(i->u64s);
906
907                 /* Empty entry: */
908                 if (!u64s)
909                         continue;
910
911                 /* Can we merge with previous entry? */
912                 if (prev &&
913                     i->btree_id == prev->btree_id &&
914                     i->level    == prev->level &&
915                     i->type     == prev->type &&
916                     i->type     == BCH_JSET_ENTRY_btree_keys &&
917                     le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
918                         memmove_u64s_down(vstruct_next(prev),
919                                           i->_data,
920                                           u64s);
921                         le16_add_cpu(&prev->u64s, u64s);
922                         continue;
923                 }
924
925                 /* Couldn't merge, move i into new position (after prev): */
926                 prev = prev ? vstruct_next(prev) : jset->start;
927                 if (i != prev)
928                         memmove_u64s_down(prev, i, jset_u64s(u64s));
929         }
930
931         prev = prev ? vstruct_next(prev) : jset->start;
932         jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
933 }
934
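/*
 * Best-effort resize of the journal write buffer towards buf_size_want, done
 * without holding j->lock; if the allocation fails we just keep writing with
 * the current (smaller) buffer.
 */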
935 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
936 {
937         /* we aren't holding j->lock: */
938         unsigned new_size = READ_ONCE(j->buf_size_want);
939         void *new_buf;
940
941         if (buf->buf_size >= new_size)
942                 return;
943
944         new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
945         if (!new_buf)
946                 return;
947
948         memcpy(new_buf, buf->data, buf->buf_size);
949         kvpfree(buf->data, buf->buf_size);
950         buf->data       = new_buf;
951         buf->buf_size   = new_size;
952 }
953
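/*
 * Journal write completion: record which devices the entry landed on, update
 * seq_ondisk/last_seq_ondisk, kick journal reclaim (newly flushed pins may
 * free up buckets), then clear prev_buf_unwritten and wake up waiters.
 */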
954 static void journal_write_done(struct closure *cl)
955 {
956         struct journal *j = container_of(cl, struct journal, io);
957         struct bch_fs *c = container_of(j, struct bch_fs, journal);
958         struct journal_buf *w = journal_prev_buf(j);
959         struct bch_devs_list devs =
960                 bch2_bkey_devs(bkey_i_to_s_c(&w->key));
961         struct bch_replicas_padded replicas;
962         u64 seq = le64_to_cpu(w->data->seq);
963         u64 last_seq = le64_to_cpu(w->data->last_seq);
964         int err = 0;
965
966         bch2_time_stats_update(j->write_time, j->write_start_time);
967
968         if (!devs.nr) {
969                 bch_err(c, "unable to write journal to sufficient devices");
970                 err = -EIO;
971         } else {
972                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
973                 if (bch2_mark_replicas(c, &replicas.e))
974                         err = -EIO;
975         }
976
977         if (err)
978                 bch2_fatal_error(c);
979
980         spin_lock(&j->lock);
981         if (seq >= j->pin.front)
982                 journal_seq_pin(j, seq)->devs = devs;
983
984         j->seq_ondisk           = seq;
985         if (err && (!j->err_seq || seq < j->err_seq))
986                 j->err_seq      = seq;
987         j->last_seq_ondisk      = last_seq;
988         bch2_journal_space_available(j);
989
990         /*
991          * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
992          * more buckets:
993          *
994          * Must come before signaling write completion, for
995          * bch2_fs_journal_stop():
996          */
997         mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
998
999         /* also must come before signalling write completion: */
1000         closure_debug_destroy(cl);
1001
1002         BUG_ON(!j->reservations.prev_buf_unwritten);
1003         atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
1004                      &j->reservations.counter);
1005
1006         closure_wake_up(&w->wait);
1007         journal_wake(j);
1008
1009         if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
1010                 mod_delayed_work(system_freezable_wq, &j->write_work, 0);
1011         spin_unlock(&j->lock);
1012 }
1013
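/*
 * Per-device write completion: on an I/O error, drop this device from the
 * write's replica key so journal_write_done() only counts the copies that
 * actually made it to disk.
 */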
1014 static void journal_write_endio(struct bio *bio)
1015 {
1016         struct bch_dev *ca = bio->bi_private;
1017         struct journal *j = &ca->fs->journal;
1018
1019         if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
1020                                bch2_blk_status_to_str(bio->bi_status)) ||
1021             bch2_meta_write_fault("journal")) {
1022                 struct journal_buf *w = journal_prev_buf(j);
1023                 unsigned long flags;
1024
1025                 spin_lock_irqsave(&j->err_lock, flags);
1026                 bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
1027                 spin_unlock_irqrestore(&j->err_lock, flags);
1028         }
1029
1030         closure_put(&j->io);
1031         percpu_ref_put(&ca->io_ref);
1032 }
1033
1034 void bch2_journal_write(struct closure *cl)
1035 {
1036         struct journal *j = container_of(cl, struct journal, io);
1037         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1038         struct bch_dev *ca;
1039         struct journal_buf *w = journal_prev_buf(j);
1040         struct jset_entry *start, *end;
1041         struct jset *jset;
1042         struct bio *bio;
1043         struct bch_extent_ptr *ptr;
1044         bool validate_before_checksum = false;
1045         unsigned i, sectors, bytes, u64s;
1046         int ret;
1047
1048         bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
1049
1050         journal_buf_realloc(j, w);
1051         jset = w->data;
1052
1053         j->write_start_time = local_clock();
1054
1055         /*
1056          * New btree roots are set by journalling them; when the journal entry
1057          * gets written we have to propagate them to c->btree_roots
1058          *
1059          * But, every journal entry we write has to contain all the btree roots
1060          * (at least for now); so after we copy btree roots to c->btree_roots we
1061          * have to get any missing btree roots and add them to this journal
1062          * entry:
1063          */
1064
1065         bch2_journal_entries_to_btree_roots(c, jset);
1066
1067         start = end = vstruct_last(jset);
1068
1069         end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
1070
1071         end     = bch2_journal_super_entries_add_common(c, end,
1072                                                 le64_to_cpu(jset->seq));
1073         u64s    = (u64 *) end - (u64 *) start;
1074         BUG_ON(u64s > j->entry_u64s_reserved);
1075
1076         le32_add_cpu(&jset->u64s, u64s);
1077         BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
1078
1079         journal_write_compact(jset);
1080
1081         jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
1082         jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
1083         jset->magic             = cpu_to_le64(jset_magic(c));
1084
1085         jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
1086                 ? cpu_to_le32(BCH_JSET_VERSION_OLD)
1087                 : cpu_to_le32(c->sb.version);
1088
1089         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1090         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1091
1092         if (journal_entry_empty(jset))
1093                 j->last_empty_seq = le64_to_cpu(jset->seq);
1094
1095         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1096                 validate_before_checksum = true;
1097
1098         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
1099                 validate_before_checksum = true;
1100
1101         if (validate_before_checksum &&
1102             jset_validate_entries(c, jset, WRITE))
1103                 goto err;
1104
1105         bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1106                     jset->encrypted_start,
1107                     vstruct_end(jset) - (void *) jset->encrypted_start);
1108
1109         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1110                                   journal_nonce(jset), jset);
1111
1112         if (!validate_before_checksum &&
1113             jset_validate_entries(c, jset, WRITE))
1114                 goto err;
1115
1116         sectors = vstruct_sectors(jset, c->block_bits);
1117         BUG_ON(sectors > w->sectors);
1118
1119         bytes = vstruct_bytes(jset);
1120         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1121
1122 retry_alloc:
1123         spin_lock(&j->lock);
1124         ret = journal_write_alloc(j, w, sectors);
1125
1126         if (ret && j->can_discard) {
1127                 spin_unlock(&j->lock);
1128                 bch2_journal_do_discards(j);
1129                 goto retry_alloc;
1130         }
1131
1132         /*
1133          * write is allocated, no longer need to account for it in
1134          * bch2_journal_space_available():
1135          */
1136         w->sectors = 0;
1137
1138         /*
1139          * journal entry has been compacted and allocated, recalculate space
1140          * available:
1141          */
1142         bch2_journal_space_available(j);
1143         spin_unlock(&j->lock);
1144
1145         if (ret) {
1146                 bch_err(c, "Unable to allocate journal write");
1147                 bch2_fatal_error(c);
1148                 continue_at(cl, journal_write_done, system_highpri_wq);
1149                 return;
1150         }
1151
1152         /*
1153          * XXX: we really should just disable the entire journal in nochanges
1154          * mode
1155          */
1156         if (c->opts.nochanges)
1157                 goto no_io;
1158
1159         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1160                 ca = bch_dev_bkey_exists(c, ptr->dev);
1161                 if (!percpu_ref_tryget(&ca->io_ref)) {
1162                         /* XXX: fix this */
1163                         bch_err(c, "missing device for journal write\n");
1164                         continue;
1165                 }
1166
1167                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1168                              sectors);
1169
1170                 bio = ca->journal.bio;
1171                 bio_reset(bio);
1172                 bio_set_dev(bio, ca->disk_sb.bdev);
1173                 bio->bi_iter.bi_sector  = ptr->offset;
1174                 bio->bi_end_io          = journal_write_endio;
1175                 bio->bi_private         = ca;
1176                 bio_set_op_attrs(bio, REQ_OP_WRITE,
1177                                  REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
1178                 bch2_bio_map(bio, jset, sectors << 9);
1179
1180                 trace_journal_write(bio);
1181                 closure_bio_submit(bio, cl);
1182
1183                 ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
1184         }
1185
1186         for_each_rw_member(ca, c, i)
1187                 if (journal_flushes_device(ca) &&
1188                     !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
1189                         percpu_ref_get(&ca->io_ref);
1190
1191                         bio = ca->journal.bio;
1192                         bio_reset(bio);
1193                         bio_set_dev(bio, ca->disk_sb.bdev);
1194                         bio->bi_opf             = REQ_OP_FLUSH;
1195                         bio->bi_end_io          = journal_write_endio;
1196                         bio->bi_private         = ca;
1197                         closure_bio_submit(bio, cl);
1198                 }
1199
1200 no_io:
1201         bch2_bucket_seq_cleanup(c);
1202
1203         continue_at(cl, journal_write_done, system_highpri_wq);
1204         return;
1205 err:
1206         bch2_inconsistent_error(c);
1207         continue_at(cl, journal_write_done, system_highpri_wq);
1208 }