git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/recovery.c

   1
   2 #include "bcachefs.h"
   3 #include "alloc.h"
   4 #include "btree_gc.h"
   5 #include "btree_update.h"
   6 #include "btree_update_interior.h"
   7 #include "btree_io.h"
   8 #include "dirent.h"
   9 #include "error.h"
  10 #include "fsck.h"
  11 #include "journal_io.h"
  12 #include "quota.h"
  13 #include "recovery.h"
  14 #include "super-io.h"
  15
  16 #include <linux/stat.h>
  17
/*
 * Brace initializer that sets .len to strlen(n) and .name to n; used in this
 * file to initialize a struct qstr from a string literal.
 * Note: the argument is expanded twice, so it should be side-effect free.
 */
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
  19
  20 struct bkey_i *btree_root_find(struct bch_fs *c,
  21                                struct bch_sb_field_clean *clean,
  22                                struct jset *j,
  23                                enum btree_id id, unsigned *level)
  24 {
  25         struct bkey_i *k;
  26         struct jset_entry *entry, *start, *end;
  27
  28         if (clean) {
  29                 start = clean->start;
  30                 end = vstruct_end(&clean->field);
  31         } else {
  32                 start = j->start;
  33                 end = vstruct_last(j);
  34         }
  35
  36         for (entry = start; entry < end; entry = vstruct_next(entry))
  37                 if (entry->type == BCH_JSET_ENTRY_btree_root &&
  38                     entry->btree_id == id)
  39                         goto found;
  40
  41         return NULL;
  42 found:
  43         if (!entry->u64s)
  44                 return ERR_PTR(-EINVAL);
  45
  46         k = entry->start;
  47         *level = entry->level;
  48         return k;
  49 }
  50
  51 static int verify_superblock_clean(struct bch_fs *c,
  52                                    struct bch_sb_field_clean *clean,
  53                                    struct jset *j)
  54 {
  55         unsigned i;
  56         int ret = 0;
  57
  58         if (!clean || !j)
  59                 return 0;
  60
  61         if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
  62                         "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
  63                         le64_to_cpu(clean->journal_seq),
  64                         le64_to_cpu(j->seq)))
  65                 bch2_fs_mark_clean(c, false);
  66
  67         mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
  68                         "superblock read clock doesn't match journal after clean shutdown");
  69         mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
  70                         "superblock read clock doesn't match journal after clean shutdown");
  71
  72         for (i = 0; i < BTREE_ID_NR; i++) {
  73                 struct bkey_i *k1, *k2;
  74                 unsigned l1 = 0, l2 = 0;
  75
  76                 k1 = btree_root_find(c, clean, NULL, i, &l1);
  77                 k2 = btree_root_find(c, NULL, j, i, &l2);
  78
  79                 if (!k1 && !k2)
  80                         continue;
  81
  82                 mustfix_fsck_err_on(!k1 || !k2 ||
  83                                     IS_ERR(k1) ||
  84                                     IS_ERR(k2) ||
  85                                     k1->k.u64s != k2->k.u64s ||
  86                                     memcmp(k1, k2, bkey_bytes(k1)) ||
  87                                     l1 != l2, c,
  88                         "superblock btree root doesn't match journal after clean shutdown");
  89         }
  90 fsck_err:
  91         return ret;
  92 }
  93
  94 static bool journal_empty(struct list_head *journal)
  95 {
  96         struct journal_replay *i;
  97         struct jset_entry *entry;
  98
  99         if (list_empty(journal))
 100                 return true;
 101
 102         i = list_last_entry(journal, struct journal_replay, list);
 103
 104         if (i->j.last_seq != i->j.seq)
 105                 return false;
 106
 107         list_for_each_entry(i, journal, list) {
 108                 vstruct_for_each(&i->j, entry) {
 109                         if (entry->type == BCH_JSET_ENTRY_btree_root)
 110                                 continue;
 111
 112                         if (entry->type == BCH_JSET_ENTRY_btree_keys &&
 113                             !entry->u64s)
 114                                 continue;
 115                         return false;
 116                 }
 117         }
 118
 119         return true;
 120 }
 121
 122 int bch2_fs_recovery(struct bch_fs *c)
 123 {
 124         const char *err = "cannot allocate memory";
 125         struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
 126         LIST_HEAD(journal);
 127         struct jset *j = NULL;
 128         unsigned i;
 129         int ret;
 130
 131         mutex_lock(&c->sb_lock);
 132         if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
 133                 bch_info(c, "building replicas info");
 134                 set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
 135         }
 136
 137         if (c->sb.clean)
 138                 sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
 139         if (sb_clean) {
 140                 clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
 141                                 GFP_KERNEL);
 142                 if (!clean) {
 143                         ret = -ENOMEM;
 144                         mutex_unlock(&c->sb_lock);
 145                         goto err;
 146                 }
 147         }
 148         mutex_unlock(&c->sb_lock);
 149
 150         if (clean)
 151                 bch_info(c, "recovering from clean shutdown, journal seq %llu",
 152                          le64_to_cpu(clean->journal_seq));
 153
 154         if (!clean || !c->opts.nofsck) {
 155                 ret = bch2_journal_read(c, &journal);
 156                 if (ret)
 157                         goto err;
 158
 159                 j = &list_entry(journal.prev, struct journal_replay, list)->j;
 160         } else {
 161                 ret = bch2_journal_set_seq(c,
 162                                            le64_to_cpu(clean->journal_seq),
 163                                            le64_to_cpu(clean->journal_seq));
 164                 BUG_ON(ret);
 165         }
 166
 167         ret = verify_superblock_clean(c, clean, j);
 168         if (ret)
 169                 goto err;
 170
 171         fsck_err_on(clean && !journal_empty(&journal), c,
 172                     "filesystem marked clean but journal not empty");
 173
 174         if (clean) {
 175                 c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
 176                 c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
 177         } else {
 178                 c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
 179                 c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
 180         }
 181
 182         for (i = 0; i < BTREE_ID_NR; i++) {
 183                 unsigned level;
 184                 struct bkey_i *k;
 185
 186                 k = btree_root_find(c, clean, j, i, &level);
 187                 if (!k)
 188                         continue;
 189
 190                 err = "invalid btree root pointer";
 191                 if (IS_ERR(k))
 192                         goto err;
 193
 194                 err = "error reading btree root";
 195                 if (bch2_btree_root_read(c, i, k, level)) {
 196                         if (i != BTREE_ID_ALLOC)
 197                                 goto err;
 198
 199                         mustfix_fsck_err(c, "error reading btree root");
 200                 }
 201         }
 202
 203         for (i = 0; i < BTREE_ID_NR; i++)
 204                 if (!c->btree_roots[i].b)
 205                         bch2_btree_root_alloc(c, i);
 206
 207         err = "error reading allocation information";
 208         ret = bch2_alloc_read(c, &journal);
 209         if (ret)
 210                 goto err;
 211
 212         set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 213
 214         bch_verbose(c, "starting mark and sweep:");
 215         err = "error in recovery";
 216         ret = bch2_initial_gc(c, &journal);
 217         if (ret)
 218                 goto err;
 219         bch_verbose(c, "mark and sweep done");
 220
 221         if (c->opts.noreplay)
 222                 goto out;
 223
 224         /*
 225          * Mark dirty before journal replay, fsck:
 226          * XXX: after a clean shutdown, this could be done lazily only when fsck
 227          * finds an error
 228          */
 229         bch2_fs_mark_clean(c, false);
 230
 231         /*
 232          * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
 233          * will give spurious errors about oldest_gen > bucket_gen -
 234          * this is a hack but oh well.
 235          */
 236         bch2_fs_journal_start(&c->journal);
 237
 238         err = "error starting allocator";
 239         ret = bch2_fs_allocator_start(c);
 240         if (ret)
 241                 goto err;
 242
 243         bch_verbose(c, "starting journal replay:");
 244         err = "journal replay failed";
 245         ret = bch2_journal_replay(c, &journal);
 246         if (ret)
 247                 goto err;
 248         bch_verbose(c, "journal replay done");
 249
 250         if (c->opts.norecovery)
 251                 goto out;
 252
 253         err = "error in fsck";
 254         ret = bch2_fsck(c);
 255         if (ret)
 256                 goto err;
 257
 258         if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags)) {
 259                 mutex_lock(&c->sb_lock);
 260                 c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
 261                 mutex_unlock(&c->sb_lock);
 262         }
 263
 264         if (enabled_qtypes(c)) {
 265                 bch_verbose(c, "reading quotas:");
 266                 ret = bch2_fs_quota_read(c);
 267                 if (ret)
 268                         goto err;
 269                 bch_verbose(c, "quotas done");
 270         }
 271
 272 out:
 273         bch2_journal_entries_free(&journal);
 274         kfree(clean);
 275         return ret;
 276 err:
 277 fsck_err:
 278         BUG_ON(!ret);
 279         goto out;
 280 }
 281
 282 int bch2_fs_initialize(struct bch_fs *c)
 283 {
 284         struct bch_inode_unpacked root_inode, lostfound_inode;
 285         struct bkey_inode_buf packed_inode;
 286         struct bch_hash_info root_hash_info;
 287         struct qstr lostfound = QSTR("lost+found");
 288         const char *err = "cannot allocate memory";
 289         struct bch_dev *ca;
 290         LIST_HEAD(journal);
 291         unsigned i;
 292         int ret;
 293
 294         bch_notice(c, "initializing new filesystem");
 295
 296         set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 297
 298         ret = bch2_initial_gc(c, &journal);
 299         if (ret)
 300                 goto err;
 301
 302         err = "unable to allocate journal buckets";
 303         for_each_online_member(ca, c, i)
 304                 if (bch2_dev_journal_alloc(ca)) {
 305                         percpu_ref_put(&ca->io_ref);
 306                         goto err;
 307                 }
 308
 309         for (i = 0; i < BTREE_ID_NR; i++)
 310                 bch2_btree_root_alloc(c, i);
 311
 312         /*
 313          * journal_res_get() will crash if called before this has
 314          * set up the journal.pin FIFO and journal.cur pointer:
 315          */
 316         bch2_fs_journal_start(&c->journal);
 317         bch2_journal_set_replay_done(&c->journal);
 318
 319         err = "error starting allocator";
 320         ret = bch2_fs_allocator_start(c);
 321         if (ret)
 322                 goto err;
 323
 324         bch2_inode_init(c, &root_inode, 0, 0,
 325                         S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
 326         root_inode.bi_inum = BCACHEFS_ROOT_INO;
 327         root_inode.bi_nlink++; /* lost+found */
 328         bch2_inode_pack(&packed_inode, &root_inode);
 329
 330         err = "error creating root directory";
 331         ret = bch2_btree_insert(c, BTREE_ID_INODES,
 332                                 &packed_inode.inode.k_i,
 333                                 NULL, NULL, 0);
 334         if (ret)
 335                 goto err;
 336
 337         bch2_inode_init(c, &lostfound_inode, 0, 0,
 338                         S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
 339                         &root_inode);
 340         lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
 341         bch2_inode_pack(&packed_inode, &lostfound_inode);
 342
 343         err = "error creating lost+found";
 344         ret = bch2_btree_insert(c, BTREE_ID_INODES,
 345                                 &packed_inode.inode.k_i,
 346                                 NULL, NULL, 0);
 347         if (ret)
 348                 goto err;
 349
 350         root_hash_info = bch2_hash_info_init(c, &root_inode);
 351
 352         ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
 353                                  &lostfound, lostfound_inode.bi_inum, NULL,
 354                                  BTREE_INSERT_NOFAIL);
 355         if (ret)
 356                 goto err;
 357
 358         atomic_long_set(&c->nr_inodes, 2);
 359
 360         if (enabled_qtypes(c)) {
 361                 ret = bch2_fs_quota_read(c);
 362                 if (ret)
 363                         goto err;
 364         }
 365
 366         err = "error writing first journal entry";
 367         ret = bch2_journal_meta(&c->journal);
 368         if (ret)
 369                 goto err;
 370
 371         mutex_lock(&c->sb_lock);
 372         SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
 373         SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
 374         c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
 375
 376         bch2_write_super(c);
 377         mutex_unlock(&c->sb_lock);
 378
 379         return 0;
 380 err:
 381         BUG_ON(!ret);
 382         return ret;
 383 }