1 // SPDX-License-Identifier: GPL-2.0
2 #include "bcachefs.h"
3 #include "alloc_foreground.h"
4 #include "btree_io.h"
5 #include "btree_update_interior.h"
6 #include "buckets.h"
7 #include "checksum.h"
8 #include "error.h"
9 #include "io.h"
10 #include "journal.h"
11 #include "journal_io.h"
12 #include "journal_reclaim.h"
13 #include "replicas.h"
14
15 #include <trace/events/bcachefs.h>
16
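/*
 * Scratch state shared by the per-device journal read closures: the ordered
 * list of journal entries found so far, the lock protecting it, and the
 * first error encountered (if any).
 */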
17 struct journal_list {
18         struct closure          cl;
19         struct mutex            lock;
20         struct list_head        *head;
21         int                     ret;
22 };
23
24 #define JOURNAL_ENTRY_ADD_OK            0
25 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE  5
26
27 /*
28  * Given a journal entry we just read, add it to the list of journal entries to
29  * be replayed:
30  */
31 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
32                              struct journal_list *jlist, struct jset *j,
33                              bool bad)
34 {
35         struct journal_replay *i, *pos;
36         struct bch_devs_list devs = { .nr = 0 };
37         struct list_head *where;
38         size_t bytes = vstruct_bytes(j);
39         __le64 last_seq;
40         int ret;
41
42         last_seq = !list_empty(jlist->head)
43                 ? list_last_entry(jlist->head, struct journal_replay,
44                                   list)->j.last_seq
45                 : 0;
46
47         if (!c->opts.read_entire_journal) {
48                 /* Is this entry older than the range we need? */
49                 if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
50                         ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
51                         goto out;
52                 }
53
54                 /* Drop entries we don't need anymore */
55                 list_for_each_entry_safe(i, pos, jlist->head, list) {
56                         if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
57                                 break;
58                         list_del(&i->list);
59                         kvpfree(i, offsetof(struct journal_replay, j) +
60                                 vstruct_bytes(&i->j));
61                 }
62         }
63
64         list_for_each_entry_reverse(i, jlist->head, list) {
65                 if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
66                         where = &i->list;
67                         goto add;
68                 }
69         }
70
71         where = jlist->head;
72 add:
73         i = where->next != jlist->head
74                 ? container_of(where->next, struct journal_replay, list)
75                 : NULL;
76
77         /*
78          * Duplicate journal entries? If so we want the one that didn't have a
79          * checksum error:
80          */
81         if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
82                 if (i->bad) {
83                         devs = i->devs;
84                         list_del(&i->list);
85                         kvpfree(i, offsetof(struct journal_replay, j) +
86                                 vstruct_bytes(&i->j));
87                 } else if (bad) {
88                         goto found;
89                 } else {
90                         fsck_err_on(bytes != vstruct_bytes(&i->j) ||
91                                     memcmp(j, &i->j, bytes), c,
92                                     "found duplicate but non-identical journal entries (seq %llu)",
93                                     le64_to_cpu(j->seq));
94                         goto found;
95                 }
96
97         }
98
99         i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
100         if (!i) {
101                 ret = -ENOMEM;
102                 goto out;
103         }
104
105         list_add(&i->list, where);
106         i->devs = devs;
107         i->bad  = bad;
108         memcpy(&i->j, j, bytes);
109 found:
110         if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
111                 bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
112         else
113                 fsck_err_on(1, c, "duplicate journal entries on same device");
114         ret = JOURNAL_ENTRY_ADD_OK;
115 out:
116 fsck_err:
117         return ret;
118 }
119
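/*
 * The journal nonce is derived from the entry's sequence number plus the
 * BCH_NONCE_JOURNAL type tag, so each journal entry is checksummed/encrypted
 * with a distinct nonce without storing one explicitly.
 */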
120 static struct nonce journal_nonce(const struct jset *jset)
121 {
122         return (struct nonce) {{
123                 [0] = 0,
124                 [1] = ((__le32 *) &jset->seq)[0],
125                 [2] = ((__le32 *) &jset->seq)[1],
126                 [3] = BCH_NONCE_JOURNAL,
127         }};
128 }
129
130 /* this fills in a range with empty jset_entries: */
131 static void journal_entry_null_range(void *start, void *end)
132 {
133         struct jset_entry *entry;
134
135         for (entry = start; entry != end; entry = vstruct_next(entry))
136                 memset(entry, 0, sizeof(*entry));
137 }
138
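/*
 * Positive, internal return codes from jset_validate()/journal_read_bucket():
 * REREAD means the entry extends past what we've read so far (retry with a
 * bigger buffer), NONE means no valid journal magic was found (end of journal
 * data in this bucket), BAD means size or checksum validation failed.
 */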
139 #define JOURNAL_ENTRY_REREAD    5
140 #define JOURNAL_ENTRY_NONE      6
141 #define JOURNAL_ENTRY_BAD       7
142
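/*
 * journal_entry_err()/journal_entry_err_on() expect 'write', 'ret' and an
 * fsck_err label in the enclosing scope: at read time a validation failure is
 * a fixable fsck error, at write time it means we're about to write out
 * corrupt metadata. Typical usage (taken from the validators below):
 *
 *	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
 *			"invalid journal seq blacklist entry: bad size"))
 *		journal_entry_null_range(entry, vstruct_next(entry));
 */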
143 #define journal_entry_err(c, msg, ...)                                  \
144 ({                                                                      \
145         switch (write) {                                                \
146         case READ:                                                      \
147                 mustfix_fsck_err(c, msg, ##__VA_ARGS__);                \
148                 break;                                                  \
149         case WRITE:                                                     \
150                 bch_err(c, "corrupt metadata before write:\n"           \
151                         msg, ##__VA_ARGS__);                            \
152                 if (bch2_fs_inconsistent(c)) {                          \
153                         ret = BCH_FSCK_ERRORS_NOT_FIXED;                \
154                         goto fsck_err;                                  \
155                 }                                                       \
156                 break;                                                  \
157         }                                                               \
158         true;                                                           \
159 })
160
161 #define journal_entry_err_on(cond, c, msg, ...)                         \
162         ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
163
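/*
 * Returned by journal_validate_key() when the offending key was dropped or
 * nulled out; the caller should revalidate from the same position instead of
 * advancing to the next key.
 */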
164 #define FSCK_DELETED_KEY        5
165
166 static int journal_validate_key(struct bch_fs *c, struct jset *jset,
167                                 struct jset_entry *entry,
168                                 unsigned level, enum btree_id btree_id,
169                                 struct bkey_i *k,
170                                 const char *type, int write)
171 {
172         void *next = vstruct_next(entry);
173         const char *invalid;
174         unsigned version = le32_to_cpu(jset->version);
175         int ret = 0;
176
177         if (journal_entry_err_on(!k->k.u64s, c,
178                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
179                         type, le64_to_cpu(jset->seq),
180                         (u64 *) entry - jset->_data,
181                         le32_to_cpu(jset->u64s),
182                         (u64 *) k - entry->_data,
183                         le16_to_cpu(entry->u64s))) {
184                 entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
185                 journal_entry_null_range(vstruct_next(entry), next);
186                 return FSCK_DELETED_KEY;
187         }
188
189         if (journal_entry_err_on((void *) bkey_next(k) >
190                                 (void *) vstruct_next(entry), c,
191                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
192                         type, le64_to_cpu(jset->seq),
193                         (u64 *) entry - jset->_data,
194                         le32_to_cpu(jset->u64s),
195                         (u64 *) k - entry->_data,
196                         le16_to_cpu(entry->u64s))) {
197                 entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
198                 journal_entry_null_range(vstruct_next(entry), next);
199                 return FSCK_DELETED_KEY;
200         }
201
202         if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
203                         "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
204                         type, le64_to_cpu(jset->seq),
205                         (u64 *) entry - jset->_data,
206                         le32_to_cpu(jset->u64s),
207                         (u64 *) k - entry->_data,
208                         le16_to_cpu(entry->u64s),
209                         k->k.format)) {
210                 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
211                 memmove(k, bkey_next(k), next - (void *) bkey_next(k));
212                 journal_entry_null_range(vstruct_next(entry), next);
213                 return FSCK_DELETED_KEY;
214         }
215
216         if (!write)
217                 bch2_bkey_compat(level, btree_id, version,
218                             JSET_BIG_ENDIAN(jset), write,
219                             NULL, bkey_to_packed(k));
220
221         invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
222                                     __btree_node_type(level, btree_id));
223         if (invalid) {
224                 char buf[160];
225
226                 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
227                 mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
228                                  type, le64_to_cpu(jset->seq),
229                                  (u64 *) entry - jset->_data,
230                                  le32_to_cpu(jset->u64s),
231                                  (u64 *) k - entry->_data,
232                                  le16_to_cpu(entry->u64s),
233                                  invalid, buf);
234
235                 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
236                 memmove(k, bkey_next(k), next - (void *) bkey_next(k));
237                 journal_entry_null_range(vstruct_next(entry), next);
238                 return FSCK_DELETED_KEY;
239         }
240
241         if (write)
242                 bch2_bkey_compat(level, btree_id, version,
243                             JSET_BIG_ENDIAN(jset), write,
244                             NULL, bkey_to_packed(k));
245 fsck_err:
246         return ret;
247 }
248
249 static int journal_entry_validate_btree_keys(struct bch_fs *c,
250                                              struct jset *jset,
251                                              struct jset_entry *entry,
252                                              int write)
253 {
254         struct bkey_i *k = entry->start;
255
256         while (k != vstruct_last(entry)) {
257                 int ret = journal_validate_key(c, jset, entry,
258                                                entry->level,
259                                                entry->btree_id,
260                                                k, "key", write);
261                 if (ret == FSCK_DELETED_KEY)
262                         continue;
263
264                 k = bkey_next(k);
265         }
266
267         return 0;
268 }
269
270 static int journal_entry_validate_btree_root(struct bch_fs *c,
271                                              struct jset *jset,
272                                              struct jset_entry *entry,
273                                              int write)
274 {
275         struct bkey_i *k = entry->start;
276         int ret = 0;
277
278         if (journal_entry_err_on(!entry->u64s ||
279                                  le16_to_cpu(entry->u64s) != k->k.u64s, c,
280                                  "invalid btree root journal entry: wrong number of keys")) {
281                 void *next = vstruct_next(entry);
282                 /*
283                  * we don't want to null out this jset_entry,
284                  * just the contents, so that later we can tell
285                  * we were _supposed_ to have a btree root
286                  */
287                 entry->u64s = 0;
288                 journal_entry_null_range(vstruct_next(entry), next);
289                 return 0;
290         }
291
292         return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
293                                     "btree root", write);
294 fsck_err:
295         return ret;
296 }
297
298 static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
299                                             struct jset *jset,
300                                             struct jset_entry *entry,
301                                             int write)
302 {
303         /* obsolete, don't care: */
304         return 0;
305 }
306
307 static int journal_entry_validate_blacklist(struct bch_fs *c,
308                                             struct jset *jset,
309                                             struct jset_entry *entry,
310                                             int write)
311 {
312         int ret = 0;
313
314         if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
315                 "invalid journal seq blacklist entry: bad size")) {
316                 journal_entry_null_range(entry, vstruct_next(entry));
317         }
318 fsck_err:
319         return ret;
320 }
321
322 static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
323                                                struct jset *jset,
324                                                struct jset_entry *entry,
325                                                int write)
326 {
327         struct jset_entry_blacklist_v2 *bl_entry;
328         int ret = 0;
329
330         if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
331                 "invalid journal seq blacklist entry: bad size")) {
332                 journal_entry_null_range(entry, vstruct_next(entry));
333                 goto out;
334         }
335
336         bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
337
338         if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
339                                  le64_to_cpu(bl_entry->end), c,
340                 "invalid journal seq blacklist entry: start > end")) {
341                 journal_entry_null_range(entry, vstruct_next(entry));
342         }
343 out:
344 fsck_err:
345         return ret;
346 }
347
348 static int journal_entry_validate_usage(struct bch_fs *c,
349                                         struct jset *jset,
350                                         struct jset_entry *entry,
351                                         int write)
352 {
353         struct jset_entry_usage *u =
354                 container_of(entry, struct jset_entry_usage, entry);
355         unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
356         int ret = 0;
357
358         if (journal_entry_err_on(bytes < sizeof(*u),
359                                  c,
360                                  "invalid journal entry usage: bad size")) {
361                 journal_entry_null_range(entry, vstruct_next(entry));
362                 return ret;
363         }
364
365 fsck_err:
366         return ret;
367 }
368
369 static int journal_entry_validate_data_usage(struct bch_fs *c,
370                                         struct jset *jset,
371                                         struct jset_entry *entry,
372                                         int write)
373 {
374         struct jset_entry_data_usage *u =
375                 container_of(entry, struct jset_entry_data_usage, entry);
376         unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
377         int ret = 0;
378
379         if (journal_entry_err_on(bytes < sizeof(*u) ||
380                                  bytes < sizeof(*u) + u->r.nr_devs,
381                                  c,
382                                  "invalid journal entry usage: bad size")) {
383                 journal_entry_null_range(entry, vstruct_next(entry));
384                 return ret;
385         }
386
387 fsck_err:
388         return ret;
389 }
390
391 struct jset_entry_ops {
392         int (*validate)(struct bch_fs *, struct jset *,
393                         struct jset_entry *, int);
394 };
395
396 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
397 #define x(f, nr)                                                \
398         [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
399                 .validate       = journal_entry_validate_##f,   \
400         },
401         BCH_JSET_ENTRY_TYPES()
402 #undef x
403 };
404
405 static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
406                                   struct jset_entry *entry, int write)
407 {
408         return entry->type < BCH_JSET_ENTRY_NR
409                 ? bch2_jset_entry_ops[entry->type].validate(c, jset,
410                                                             entry, write)
411                 : 0;
412 }
413
414 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
415                                  int write)
416 {
417         struct jset_entry *entry;
418         int ret = 0;
419
420         vstruct_for_each(jset, entry) {
421                 if (journal_entry_err_on(vstruct_next(entry) >
422                                          vstruct_last(jset), c,
423                                 "journal entry extends past end of jset")) {
424                         jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
425                         break;
426                 }
427
428                 ret = journal_entry_validate(c, jset, entry, write);
429                 if (ret)
430                         break;
431         }
432 fsck_err:
433         return ret;
434 }
435
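/*
 * Validate a single on-disk journal entry (jset): check the magic and
 * version, that it fits within what we've read so far and within the bucket,
 * verify the checksum, decrypt the payload, and sanity check
 * last_seq <= seq.
 */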
436 static int jset_validate(struct bch_fs *c,
437                          struct bch_dev *ca,
438                          struct jset *jset, u64 sector,
439                          unsigned bucket_sectors_left,
440                          unsigned sectors_read,
441                          int write)
442 {
443         size_t bytes = vstruct_bytes(jset);
444         struct bch_csum csum;
445         unsigned version;
446         int ret = 0;
447
448         if (le64_to_cpu(jset->magic) != jset_magic(c))
449                 return JOURNAL_ENTRY_NONE;
450
451         version = le32_to_cpu(jset->version);
452         if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
453                                   version < bcachefs_metadata_version_min) ||
454                                  version >= bcachefs_metadata_version_max, c,
455                         "%s sector %llu seq %llu: unknown journal entry version %u",
456                         ca->name, sector, le64_to_cpu(jset->seq),
457                         version)) {
458                 /* don't try to continue: */
459                 return EINVAL;
460         }
461
462         if (bytes > (sectors_read << 9) &&
463             sectors_read < bucket_sectors_left)
464                 return JOURNAL_ENTRY_REREAD;
465
466         if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
467                         "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
468                         ca->name, sector, le64_to_cpu(jset->seq), bytes)) {
469                 ret = JOURNAL_ENTRY_BAD;
470                 le32_add_cpu(&jset->u64s,
471                              -((bytes - (bucket_sectors_left << 9)) / 8));
472         }
473
474         if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
475                         "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
476                         ca->name, sector, le64_to_cpu(jset->seq),
477                         JSET_CSUM_TYPE(jset))) {
478                 ret = JOURNAL_ENTRY_BAD;
479                 goto bad_csum_type;
480         }
481
482         csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
483         if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
484                                  "%s sector %llu seq %llu: journal checksum bad",
485                                  ca->name, sector, le64_to_cpu(jset->seq)))
486                 ret = JOURNAL_ENTRY_BAD;
487
488         bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
489                      jset->encrypted_start,
490                      vstruct_end(jset) - (void *) jset->encrypted_start);
491 bad_csum_type:
492         if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
493                                  "invalid journal entry: last_seq > seq")) {
494                 jset->last_seq = jset->seq;
495                 return JOURNAL_ENTRY_BAD;
496         }
497 fsck_err:
498         return ret;
499 }
500
501 struct journal_read_buf {
502         void            *data;
503         size_t          size;
504 };
505
506 static int journal_read_buf_realloc(struct journal_read_buf *b,
507                                     size_t new_size)
508 {
509         void *n;
510
511         /* the bios are sized for this many pages, max: */
512         if (new_size > JOURNAL_ENTRY_SIZE_MAX)
513                 return -ENOMEM;
514
515         new_size = roundup_pow_of_two(new_size);
516         n = kvpmalloc(new_size, GFP_KERNEL);
517         if (!n)
518                 return -ENOMEM;
519
520         kvpfree(b->data, b->size);
521         b->data = n;
522         b->size = new_size;
523         return 0;
524 }
525
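/*
 * Scan one journal bucket: read it in buffer-sized chunks, validate each jset
 * we find and hand the usable ones to journal_entry_add(). On a checksum
 * error we don't trust the entry's size field, so scanning resumes at the
 * next block boundary.
 */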
526 static int journal_read_bucket(struct bch_dev *ca,
527                                struct journal_read_buf *buf,
528                                struct journal_list *jlist,
529                                unsigned bucket)
530 {
531         struct bch_fs *c = ca->fs;
532         struct journal_device *ja = &ca->journal;
533         struct jset *j = NULL;
534         unsigned sectors, sectors_read = 0;
535         u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
536             end = offset + ca->mi.bucket_size;
537         bool saw_bad = false;
538         int ret = 0;
539
540         pr_debug("reading %u", bucket);
541
542         while (offset < end) {
543                 if (!sectors_read) {
544                         struct bio *bio;
545 reread:
546                         sectors_read = min_t(unsigned,
547                                 end - offset, buf->size >> 9);
548
549                         bio = bio_kmalloc(GFP_KERNEL,
550                                           buf_pages(buf->data,
551                                                     sectors_read << 9));
552                         bio_set_dev(bio, ca->disk_sb.bdev);
553                         bio->bi_iter.bi_sector  = offset;
554                         bio_set_op_attrs(bio, REQ_OP_READ, 0);
555                         bch2_bio_map(bio, buf->data, sectors_read << 9);
556
557                         ret = submit_bio_wait(bio);
558                         bio_put(bio);
559
560                         if (bch2_dev_io_err_on(ret, ca,
561                                                "journal read from sector %llu",
562                                                offset) ||
563                             bch2_meta_read_fault("journal"))
564                                 return -EIO;
565
566                         j = buf->data;
567                 }
568
569                 ret = jset_validate(c, ca, j, offset,
570                                     end - offset, sectors_read,
571                                     READ);
572                 switch (ret) {
573                 case BCH_FSCK_OK:
574                         sectors = vstruct_sectors(j, c->block_bits);
575                         break;
576                 case JOURNAL_ENTRY_REREAD:
577                         if (vstruct_bytes(j) > buf->size) {
578                                 ret = journal_read_buf_realloc(buf,
579                                                         vstruct_bytes(j));
580                                 if (ret)
581                                         return ret;
582                         }
583                         goto reread;
584                 case JOURNAL_ENTRY_NONE:
585                         if (!saw_bad)
586                                 return 0;
587                         sectors = c->opts.block_size;
588                         goto next_block;
589                 case JOURNAL_ENTRY_BAD:
590                         saw_bad = true;
591                         /*
592                          * On checksum error we don't really trust the size
593                          * field of the journal entry we read, so try reading
594                          * again at next block boundary:
595                          */
596                         sectors = c->opts.block_size;
597                         break;
598                 default:
599                         return ret;
600                 }
601
602                 /*
603                  * This happens sometimes if we don't have discards on -
604                  * when we've partially overwritten a bucket with new
605                  * journal entries. We don't need the rest of the
606                  * bucket:
607                  */
608                 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
609                         return 0;
610
611                 ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
612
613                 mutex_lock(&jlist->lock);
614                 ret = journal_entry_add(c, ca, jlist, j, ret != 0);
615                 mutex_unlock(&jlist->lock);
616
617                 switch (ret) {
618                 case JOURNAL_ENTRY_ADD_OK:
619                         break;
620                 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
621                         break;
622                 default:
623                         return ret;
624                 }
625 next_block:
626                 pr_debug("next");
627                 offset          += sectors;
628                 sectors_read    -= sectors;
629                 j = ((void *) j) + (sectors << 9);
630         }
631
632         return 0;
633 }
634
635 static void bch2_journal_read_device(struct closure *cl)
636 {
637         struct journal_device *ja =
638                 container_of(cl, struct journal_device, read);
639         struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
640         struct journal_list *jlist =
641                 container_of(cl->parent, struct journal_list, cl);
642         struct journal_read_buf buf = { NULL, 0 };
643         u64 min_seq = U64_MAX;
644         unsigned i;
645         int ret;
646
647         if (!ja->nr)
648                 goto out;
649
650         ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
651         if (ret)
652                 goto err;
653
654         pr_debug("%u journal buckets", ja->nr);
655
656         for (i = 0; i < ja->nr; i++) {
657                 ret = journal_read_bucket(ca, &buf, jlist, i);
658                 if (ret)
659                         goto err;
660         }
661
662         /* Find the journal bucket with the highest sequence number: */
663         for (i = 0; i < ja->nr; i++) {
664                 if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
665                         ja->cur_idx = i;
666
667                 min_seq = min(ja->bucket_seq[i], min_seq);
668         }
669
670         /*
671          * If there are duplicate journal entries in multiple buckets (which
672          * definitely isn't supposed to happen, but...) - make sure to start
673          * cur_idx at the last of those buckets, so we don't deadlock trying to
674          * allocate
675          */
676         while (ja->bucket_seq[ja->cur_idx] > min_seq &&
677                ja->bucket_seq[ja->cur_idx] >
678                ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
679                 ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
680
681         ja->sectors_free = 0;
682
683         /*
684          * Set dirty_idx to indicate the entire journal is full and needs to be
685          * reclaimed - journal reclaim will immediately reclaim whatever isn't
686          * pinned when it first runs:
687          */
688         ja->discard_idx = ja->dirty_idx_ondisk =
689                 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
690 out:
691         kvpfree(buf.data, buf.size);
692         percpu_ref_put(&ca->io_ref);
693         closure_return(cl);
694         return;
695 err:
696         mutex_lock(&jlist->lock);
697         jlist->ret = ret;
698         mutex_unlock(&jlist->lock);
699         goto out;
700 }
701
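/*
 * Read the journal from every member device that may contain journal data,
 * in parallel, then validate the combined list of entries and make sure the
 * superblock is marked with the journal replicas we actually found.
 */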
702 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
703 {
704         struct journal_list jlist;
705         struct journal_replay *i;
706         struct bch_dev *ca;
707         unsigned iter;
708         size_t keys = 0, entries = 0;
709         bool degraded = false;
710         int ret = 0;
711
712         closure_init_stack(&jlist.cl);
713         mutex_init(&jlist.lock);
714         jlist.head = list;
715         jlist.ret = 0;
716
717         for_each_member_device(ca, c, iter) {
718                 if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
719                     !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
720                         continue;
721
722                 if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
723                      ca->mi.state == BCH_MEMBER_STATE_RO) &&
724                     percpu_ref_tryget(&ca->io_ref))
725                         closure_call(&ca->journal.read,
726                                      bch2_journal_read_device,
727                                      system_unbound_wq,
728                                      &jlist.cl);
729                 else
730                         degraded = true;
731         }
732
733         closure_sync(&jlist.cl);
734
735         if (jlist.ret)
736                 return jlist.ret;
737
738         list_for_each_entry(i, list, list) {
739                 struct jset_entry *entry;
740                 struct bkey_i *k, *_n;
741                 struct bch_replicas_padded replicas;
742                 char buf[80];
743
744                 ret = jset_validate_entries(c, &i->j, READ);
745                 if (ret)
746                         goto fsck_err;
747
748                 /*
749                  * If we're mounting in degraded mode - if we didn't read all
750                  * the devices - this is wrong:
751                  */
752
753                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
754
755                 if (!degraded &&
756                     (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
757                      fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
758                                  "superblock not marked as containing replicas %s",
759                                  (bch2_replicas_entry_to_text(&PBUF(buf),
760                                                               &replicas.e), buf)))) {
761                         ret = bch2_mark_replicas(c, &replicas.e);
762                         if (ret)
763                                 return ret;
764                 }
765
766                 for_each_jset_key(k, _n, entry, &i->j)
767                         keys++;
768                 entries++;
769         }
770
771         if (!list_empty(list)) {
772                 i = list_last_entry(list, struct journal_replay, list);
773
774                 bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
775                          keys, entries, le64_to_cpu(i->j.seq));
776         }
777 fsck_err:
778         return ret;
779 }
780
781 /* journal write: */
782
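/*
 * Add pointers to @w->key for up to @replicas_want copies of the journal
 * write, only using devices that have non-zero durability, are RW, have
 * journal buckets, aren't already in the key, and have @sectors free in
 * their current journal bucket.
 */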
783 static void __journal_write_alloc(struct journal *j,
784                                   struct journal_buf *w,
785                                   struct dev_alloc_list *devs_sorted,
786                                   unsigned sectors,
787                                   unsigned *replicas,
788                                   unsigned replicas_want)
789 {
790         struct bch_fs *c = container_of(j, struct bch_fs, journal);
791         struct journal_device *ja;
792         struct bch_dev *ca;
793         unsigned i;
794
795         if (*replicas >= replicas_want)
796                 return;
797
798         for (i = 0; i < devs_sorted->nr; i++) {
799                 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
800                 if (!ca)
801                         continue;
802
803                 ja = &ca->journal;
804
805                 /*
806                  * Check that we can use this device, and aren't already using
807                  * it:
808                  */
809                 if (!ca->mi.durability ||
810                     ca->mi.state != BCH_MEMBER_STATE_RW ||
811                     !ja->nr ||
812                     bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
813                                          ca->dev_idx) ||
814                     sectors > ja->sectors_free)
815                         continue;
816
817                 bch2_dev_stripe_increment(ca, &j->wp.stripe);
818
819                 bch2_bkey_append_ptr(&w->key,
820                         (struct bch_extent_ptr) {
821                                   .offset = bucket_to_sector(ca,
822                                         ja->buckets[ja->cur_idx]) +
823                                         ca->mi.bucket_size -
824                                         ja->sectors_free,
825                                   .dev = ca->dev_idx,
826                 });
827
828                 ja->sectors_free -= sectors;
829                 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
830
831                 *replicas += ca->mi.durability;
832
833                 if (*replicas >= replicas_want)
834                         break;
835         }
836 }
837
838 /**
839  * journal_write_alloc - allocate journal space, moving on to the next journal bucket if possible
840  */
841 static int journal_write_alloc(struct journal *j, struct journal_buf *w,
842                                unsigned sectors)
843 {
844         struct bch_fs *c = container_of(j, struct bch_fs, journal);
845         struct journal_device *ja;
846         struct bch_dev *ca;
847         struct dev_alloc_list devs_sorted;
848         unsigned i, replicas = 0, replicas_want =
849                 READ_ONCE(c->opts.metadata_replicas);
850
851         rcu_read_lock();
852
853         devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
854                                           &c->rw_devs[BCH_DATA_journal]);
855
856         __journal_write_alloc(j, w, &devs_sorted,
857                               sectors, &replicas, replicas_want);
858
859         if (replicas >= replicas_want)
860                 goto done;
861
862         for (i = 0; i < devs_sorted.nr; i++) {
863                 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
864                 if (!ca)
865                         continue;
866
867                 ja = &ca->journal;
868
869                 if (sectors > ja->sectors_free &&
870                     sectors <= ca->mi.bucket_size &&
871                     bch2_journal_dev_buckets_available(j, ja,
872                                         journal_space_discarded)) {
873                         ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
874                         ja->sectors_free = ca->mi.bucket_size;
875
876                         /*
877                          * ja->bucket_seq[ja->cur_idx] must always have
878                          * something sensible:
879                          */
880                         ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
881                 }
882         }
883
884         __journal_write_alloc(j, w, &devs_sorted,
885                               sectors, &replicas, replicas_want);
886 done:
887         rcu_read_unlock();
888
889         return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
890 }
891
892 static void journal_write_compact(struct jset *jset)
893 {
894         struct jset_entry *i, *next, *prev = NULL;
895
896         /*
897          * Simple compaction, dropping empty jset_entries (from journal
898          * reservations that weren't fully used) and merging jset_entries that
899          * can be.
900          *
901          * If we wanted to be really fancy here, we could sort all the keys in
902          * the jset and drop keys that were overwritten - probably not worth it:
903          */
904         vstruct_for_each_safe(jset, i, next) {
905                 unsigned u64s = le16_to_cpu(i->u64s);
906
907                 /* Empty entry: */
908                 if (!u64s)
909                         continue;
910
911                 /* Can we merge with previous entry? */
912                 if (prev &&
913                     i->btree_id == prev->btree_id &&
914                     i->level    == prev->level &&
915                     i->type     == prev->type &&
916                     i->type     == BCH_JSET_ENTRY_btree_keys &&
917                     le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
918                         memmove_u64s_down(vstruct_next(prev),
919                                           i->_data,
920                                           u64s);
921                         le16_add_cpu(&prev->u64s, u64s);
922                         continue;
923                 }
924
925                 /* Couldn't merge, move i into new position (after prev): */
926                 prev = prev ? vstruct_next(prev) : jset->start;
927                 if (i != prev)
928                         memmove_u64s_down(prev, i, jset_u64s(u64s));
929         }
930
931         prev = prev ? vstruct_next(prev) : jset->start;
932         jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
933 }
934
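/*
 * Best-effort resize of the journal write buffer towards buf_size_want, done
 * without holding j->lock; if the allocation fails we just keep writing with
 * the current (smaller) buffer.
 */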
935 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
936 {
937         /* we aren't holding j->lock: */
938         unsigned new_size = READ_ONCE(j->buf_size_want);
939         void *new_buf;
940
941         if (buf->buf_size >= new_size)
942                 return;
943
944         new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
945         if (!new_buf)
946                 return;
947
948         memcpy(new_buf, buf->data, buf->buf_size);
949         kvpfree(buf->data, buf->buf_size);
950         buf->data       = new_buf;
951         buf->buf_size   = new_size;
952 }
953
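/*
 * Journal write completion: record which devices the entry landed on, update
 * seq_ondisk/last_seq_ondisk, kick journal reclaim (newly flushed pins may
 * free up buckets), then clear prev_buf_unwritten and wake up waiters.
 */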
954 static void journal_write_done(struct closure *cl)
955 {
956         struct journal *j = container_of(cl, struct journal, io);
957         struct bch_fs *c = container_of(j, struct bch_fs, journal);
958         struct journal_buf *w = journal_prev_buf(j);
959         struct bch_devs_list devs =
960                 bch2_bkey_devs(bkey_i_to_s_c(&w->key));
961         struct bch_replicas_padded replicas;
962         u64 seq = le64_to_cpu(w->data->seq);
963         u64 last_seq = le64_to_cpu(w->data->last_seq);
964         int err = 0;
965
966         bch2_time_stats_update(j->write_time, j->write_start_time);
967
968         if (!devs.nr) {
969                 bch_err(c, "unable to write journal to sufficient devices");
970                 err = -EIO;
971         } else {
972                 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
973                 if (bch2_mark_replicas(c, &replicas.e))
974                         err = -EIO;
975         }
976
977         if (err)
978                 bch2_fatal_error(c);
979
980         spin_lock(&j->lock);
981         if (seq >= j->pin.front)
982                 journal_seq_pin(j, seq)->devs = devs;
983
984         j->seq_ondisk           = seq;
985         if (err && (!j->err_seq || seq < j->err_seq))
986                 j->err_seq      = seq;
987         j->last_seq_ondisk      = last_seq;
988         bch2_journal_space_available(j);
989
990         /*
991          * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
992          * more buckets:
993          *
994          * Must come before signaling write completion, for
995          * bch2_fs_journal_stop():
996          */
997         mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
998
999         /* also must come before signalling write completion: */
1000         closure_debug_destroy(cl);
1001
1002         BUG_ON(!j->reservations.prev_buf_unwritten);
1003         atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
1004                      &j->reservations.counter);
1005
1006         closure_wake_up(&w->wait);
1007         journal_wake(j);
1008
1009         if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
1010                 mod_delayed_work(system_freezable_wq, &j->write_work, 0);
1011         spin_unlock(&j->lock);
1012 }
1013
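/*
 * Per-device write completion: on an I/O error, drop this device from the
 * write's replica key so journal_write_done() only counts the copies that
 * actually made it to disk.
 */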
1014 static void journal_write_endio(struct bio *bio)
1015 {
1016         struct bch_dev *ca = bio->bi_private;
1017         struct journal *j = &ca->fs->journal;
1018
1019         if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s",
1020                                bch2_blk_status_to_str(bio->bi_status)) ||
1021             bch2_meta_write_fault("journal")) {
1022                 struct journal_buf *w = journal_prev_buf(j);
1023                 unsigned long flags;
1024
1025                 spin_lock_irqsave(&j->err_lock, flags);
1026                 bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
1027                 spin_unlock_irqrestore(&j->err_lock, flags);
1028         }
1029
1030         closure_put(&j->io);
1031         percpu_ref_put(&ca->io_ref);
1032 }
1033
1034 void bch2_journal_write(struct closure *cl)
1035 {
1036         struct journal *j = container_of(cl, struct journal, io);
1037         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1038         struct bch_dev *ca;
1039         struct journal_buf *w = journal_prev_buf(j);
1040         struct jset_entry *start, *end;
1041         struct jset *jset;
1042         struct bio *bio;
1043         struct bch_extent_ptr *ptr;
1044         bool validate_before_checksum = false;
1045         unsigned i, sectors, bytes, u64s;
1046         int ret;
1047
1048         bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
1049
1050         journal_buf_realloc(j, w);
1051         jset = w->data;
1052
1053         j->write_start_time = local_clock();
1054
1055         /*
1056          * New btree roots are set by journalling them; when the journal entry
1057          * gets written we have to propagate them to c->btree_roots
1058          *
1059          * But, every journal entry we write has to contain all the btree roots
1060          * (at least for now); so after we copy btree roots to c->btree_roots we
1061          * have to get any missing btree roots and add them to this journal
1062          * entry:
1063          */
1064
1065         bch2_journal_entries_to_btree_roots(c, jset);
1066
1067         start = end = vstruct_last(jset);
1068
1069         end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
1070
1071         end     = bch2_journal_super_entries_add_common(c, end,
1072                                                 le64_to_cpu(jset->seq));
1073         u64s    = (u64 *) end - (u64 *) start;
1074         BUG_ON(u64s > j->entry_u64s_reserved);
1075
1076         le32_add_cpu(&jset->u64s, u64s);
1077         BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
1078
1079         journal_write_compact(jset);
1080
1081         jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
1082         jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
1083         jset->magic             = cpu_to_le64(jset_magic(c));
1084
1085         jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
1086                 ? cpu_to_le32(BCH_JSET_VERSION_OLD)
1087                 : cpu_to_le32(c->sb.version);
1088
1089         SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
1090         SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
1091
1092         if (journal_entry_empty(jset))
1093                 j->last_empty_seq = le64_to_cpu(jset->seq);
1094
1095         if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
1096                 validate_before_checksum = true;
1097
1098         if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
1099                 validate_before_checksum = true;
1100
1101         if (validate_before_checksum &&
1102             jset_validate_entries(c, jset, WRITE))
1103                 goto err;
1104
1105         bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
1106                     jset->encrypted_start,
1107                     vstruct_end(jset) - (void *) jset->encrypted_start);
1108
1109         jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
1110                                   journal_nonce(jset), jset);
1111
1112         if (!validate_before_checksum &&
1113             jset_validate_entries(c, jset, WRITE))
1114                 goto err;
1115
1116         sectors = vstruct_sectors(jset, c->block_bits);
1117         BUG_ON(sectors > w->sectors);
1118
1119         bytes = vstruct_bytes(jset);
1120         memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
1121
1122 retry_alloc:
1123         spin_lock(&j->lock);
1124         ret = journal_write_alloc(j, w, sectors);
1125
1126         if (ret && j->can_discard) {
1127                 spin_unlock(&j->lock);
1128                 bch2_journal_do_discards(j);
1129                 goto retry_alloc;
1130         }
1131
1132         /*
1133          * write is allocated, no longer need to account for it in
1134          * bch2_journal_space_available():
1135          */
1136         w->sectors = 0;
1137
1138         /*
1139          * journal entry has been compacted and allocated, recalculate space
1140          * available:
1141          */
1142         bch2_journal_space_available(j);
1143         spin_unlock(&j->lock);
1144
1145         if (ret) {
1146                 bch_err(c, "Unable to allocate journal write");
1147                 bch2_fatal_error(c);
1148                 continue_at(cl, journal_write_done, system_highpri_wq);
1149                 return;
1150         }
1151
1152         /*
1153          * XXX: we really should just disable the entire journal in nochanges
1154          * mode
1155          */
1156         if (c->opts.nochanges)
1157                 goto no_io;
1158
1159         extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
1160                 ca = bch_dev_bkey_exists(c, ptr->dev);
1161                 if (!percpu_ref_tryget(&ca->io_ref)) {
1162                         /* XXX: fix this */
1163                         bch_err(c, "missing device for journal write\n");
1164                         continue;
1165                 }
1166
1167                 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
1168                              sectors);
1169
1170                 bio = ca->journal.bio;
1171                 bio_reset(bio);
1172                 bio_set_dev(bio, ca->disk_sb.bdev);
1173                 bio->bi_iter.bi_sector  = ptr->offset;
1174                 bio->bi_end_io          = journal_write_endio;
1175                 bio->bi_private         = ca;
1176                 bio_set_op_attrs(bio, REQ_OP_WRITE,
1177                                  REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
1178                 bch2_bio_map(bio, jset, sectors << 9);
1179
1180                 trace_journal_write(bio);
1181                 closure_bio_submit(bio, cl);
1182
1183                 ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
1184         }
1185
1186         for_each_rw_member(ca, c, i)
1187                 if (journal_flushes_device(ca) &&
1188                     !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
1189                         percpu_ref_get(&ca->io_ref);
1190
1191                         bio = ca->journal.bio;
1192                         bio_reset(bio);
1193                         bio_set_dev(bio, ca->disk_sb.bdev);
1194                         bio->bi_opf             = REQ_OP_FLUSH;
1195                         bio->bi_end_io          = journal_write_endio;
1196                         bio->bi_private         = ca;
1197                         closure_bio_submit(bio, cl);
1198                 }
1199
1200 no_io:
1201         bch2_bucket_seq_cleanup(c);
1202
1203         continue_at(cl, journal_write_done, system_highpri_wq);
1204         return;
1205 err:
1206         bch2_inconsistent_error(c);
1207         continue_at(cl, journal_write_done, system_highpri_wq);
1208 }