libbcachefs/recovery.c

#include "bcachefs.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/stat.h>

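/*
 * Build a struct qstr from a NUL-terminated string, filling in only the
 * length and name (no hash) - used below for the "lost+found" dirent.
 */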
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }

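/*
 * Find the btree root entry for @id, either in the superblock clean section
 * (if @clean is non-NULL) or in journal entry @j: returns NULL if no root
 * entry was found, or an ERR_PTR if the entry is present but empty.
 */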
static struct bkey_i *btree_root_find(struct bch_fs *c,
                                      struct bch_sb_field_clean *clean,
                                      struct jset *j,
                                      enum btree_id id, unsigned *level)
{
        struct bkey_i *k;
        struct jset_entry *entry, *start, *end;

        if (clean) {
                start = clean->start;
                end = vstruct_end(&clean->field);
        } else {
                start = j->start;
                end = vstruct_last(j);
        }

        for (entry = start; entry < end; entry = vstruct_next(entry))
                if (entry->type == BCH_JSET_ENTRY_btree_root &&
                    entry->btree_id == id)
                        goto found;

        return NULL;
found:
        if (!entry->u64s)
                return ERR_PTR(-EINVAL);

        k = entry->start;
        *level = entry->level;
        return k;
}

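/*
 * Process a single journal (or superblock clean section) entry before replay
 * proper: pick up btree roots, filesystem usage counters and per-replicas
 * usage so they are in place before the btrees are read.
 */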
static int journal_replay_entry_early(struct bch_fs *c,
                                      struct jset_entry *entry)
{
        int ret = 0;

        switch (entry->type) {
        case BCH_JSET_ENTRY_btree_root: {
                struct btree_root *r = &c->btree_roots[entry->btree_id];

                if (entry->u64s) {
                        r->level = entry->level;
                        bkey_copy(&r->key, &entry->start[0]);
                        r->error = 0;
                } else {
                        r->error = -EIO;
                }
                r->alive = true;
                break;
        }
        case BCH_JSET_ENTRY_usage: {
                struct jset_entry_usage *u =
                        container_of(entry, struct jset_entry_usage, entry);

                switch (entry->btree_id) {
                case FS_USAGE_RESERVED:
                        if (entry->level < BCH_REPLICAS_MAX)
                                percpu_u64_set(&c->usage[0]->
                                               persistent_reserved[entry->level],
                                               le64_to_cpu(u->v));
                        break;
                case FS_USAGE_INODES:
                        percpu_u64_set(&c->usage[0]->nr_inodes,
                                       le64_to_cpu(u->v));
                        break;
                case FS_USAGE_KEY_VERSION:
                        atomic64_set(&c->key_version,
                                     le64_to_cpu(u->v));
                        break;
                }

                break;
        }
        case BCH_JSET_ENTRY_data_usage: {
                struct jset_entry_data_usage *u =
                        container_of(entry, struct jset_entry_data_usage, entry);
                ret = bch2_replicas_set_usage(c, &u->r,
                                              le64_to_cpu(u->v));
                break;
        }
        }

        return ret;
}

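/*
 * Cross-check the superblock clean section against the last journal entry:
 * after a clean shutdown the journal sequence number, bucket clocks and btree
 * roots must all agree; mismatches are reported as fixable fsck errors.
 */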
static int verify_superblock_clean(struct bch_fs *c,
                                   struct bch_sb_field_clean *clean,
                                   struct jset *j)
{
        unsigned i;
        int ret = 0;

        if (!clean || !j)
                return 0;

        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
                        le64_to_cpu(j->seq)))
                bch2_fs_mark_clean(c, false);

        mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
                        "superblock read clock doesn't match journal after clean shutdown");
        mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
                        "superblock write clock doesn't match journal after clean shutdown");

        for (i = 0; i < BTREE_ID_NR; i++) {
                struct bkey_i *k1, *k2;
                unsigned l1 = 0, l2 = 0;

                k1 = btree_root_find(c, clean, NULL, i, &l1);
                k2 = btree_root_find(c, NULL, j, i, &l2);

                if (!k1 && !k2)
                        continue;

                mustfix_fsck_err_on(!k1 || !k2 ||
                                    IS_ERR(k1) ||
                                    IS_ERR(k2) ||
                                    k1->k.u64s != k2->k.u64s ||
                                    memcmp(k1, k2, bkey_bytes(k1)) ||
                                    l1 != l2, c,
                        "superblock btree root doesn't match journal after clean shutdown");
        }
fsck_err:
        return ret;
}

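/*
 * Check whether the journal contains anything that would actually need to be
 * replayed: anything other than btree roots, usage entries and empty btree
 * key sets means the journal is not empty.
 */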
static bool journal_empty(struct list_head *journal)
{
        struct journal_replay *i;
        struct jset_entry *entry;

        if (list_empty(journal))
                return true;

        i = list_last_entry(journal, struct journal_replay, list);

        if (i->j.last_seq != i->j.seq)
                return false;

        list_for_each_entry(i, journal, list) {
                vstruct_for_each(&i->j, entry) {
                        if (entry->type == BCH_JSET_ENTRY_btree_root ||
                            entry->type == BCH_JSET_ENTRY_usage ||
                            entry->type == BCH_JSET_ENTRY_data_usage)
                                continue;

                        if (entry->type == BCH_JSET_ENTRY_btree_keys &&
                            !entry->u64s)
                                continue;
                        return false;
                }
        }

        return true;
}

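/*
 * Main recovery path for an existing filesystem: read the superblock clean
 * section and/or the journal, restore btree roots and usage info, run the
 * initial mark and sweep GC if needed, then replay the journal and
 * (optionally) fsck.
 */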
int bch2_fs_recovery(struct bch_fs *c)
{
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
        struct jset_entry *entry;
        LIST_HEAD(journal);
        struct jset *j = NULL;
        unsigned i;
        int ret;

        mutex_lock(&c->sb_lock);
        if (!c->replicas.entries) {
                bch_info(c, "building replicas info");
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }

        if (c->sb.clean)
                sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
        if (sb_clean) {
                clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
                                GFP_KERNEL);
                if (!clean) {
                        ret = -ENOMEM;
                        mutex_unlock(&c->sb_lock);
                        goto err;
                }

                if (le16_to_cpu(c->disk_sb.sb->version) <
                    bcachefs_metadata_version_bkey_renumber)
                        bch2_sb_clean_renumber(clean, READ);
        }
        mutex_unlock(&c->sb_lock);

        if (clean)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));

        if (!clean || c->opts.fsck) {
                ret = bch2_journal_read(c, &journal);
                if (ret)
                        goto err;

                j = &list_entry(journal.prev, struct journal_replay, list)->j;
        } else {
                ret = bch2_journal_set_seq(c,
                                           le64_to_cpu(clean->journal_seq),
                                           le64_to_cpu(clean->journal_seq));
                BUG_ON(ret);
        }

        ret = verify_superblock_clean(c, clean, j);
        if (ret)
                goto err;

        fsck_err_on(clean && !journal_empty(&journal), c,
                    "filesystem marked clean but journal not empty");

        err = "insufficient memory";
        if (clean) {
                c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
                c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);

                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
                        ret = journal_replay_entry_early(c, entry);
                        if (ret)
                                goto err;
                }
        } else {
                struct journal_replay *i;

                c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
                c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);

                list_for_each_entry(i, &journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        goto err;
                        }
        }

        bch2_fs_usage_initialize(c);

        for (i = 0; i < BTREE_ID_NR; i++) {
                struct btree_root *r = &c->btree_roots[i];

                if (!r->alive)
                        continue;

                err = "invalid btree root pointer";
                if (r->error)
                        goto err;

                err = "error reading btree root";
                if (bch2_btree_root_read(c, i, &r->key, r->level)) {
                        if (i != BTREE_ID_ALLOC)
                                goto err;

                        mustfix_fsck_err(c, "error reading btree root");
                }
        }

        for (i = 0; i < BTREE_ID_NR; i++)
                if (!c->btree_roots[i].b)
                        bch2_btree_root_alloc(c, i);

        err = "error reading allocation information";
        ret = bch2_alloc_read(c, &journal);
        if (ret)
                goto err;

        bch_verbose(c, "starting stripes_read");
        ret = bch2_stripes_read(c, &journal);
        if (ret)
                goto err;
        bch_verbose(c, "stripes_read done");

        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);

        if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
            c->opts.fsck) {
                bch_verbose(c, "starting mark and sweep:");
                err = "error in recovery";
                ret = bch2_gc(c, &journal, true);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
        }

        clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

        /*
         * Skip past versions that might have possibly been used (as nonces),
         * but hadn't had their pointers written:
         */
        if (c->sb.encryption_type && !c->sb.clean)
                atomic64_add(1 << 16, &c->key_version);

        if (c->opts.noreplay)
                goto out;

        /*
         * Mark dirty before journal replay, fsck:
         * XXX: after a clean shutdown, this could be done lazily only when fsck
         * finds an error
         */
        bch2_fs_mark_clean(c, false);

        /*
         * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
         * will give spurious errors about oldest_gen > bucket_gen -
         * this is a hack but oh well.
         */
        bch2_fs_journal_start(&c->journal);

        err = "error starting allocator";
        ret = bch2_fs_allocator_start(c);
        if (ret)
                goto err;

        bch_verbose(c, "starting journal replay:");
        err = "journal replay failed";
        ret = bch2_journal_replay(c, &journal);
        if (ret)
                goto err;
        bch_verbose(c, "journal replay done");

        if (c->opts.norecovery)
                goto out;

        err = "error in fsck";
        ret = bch2_fsck(c);
        if (ret)
                goto err;

        mutex_lock(&c->sb_lock);
        if (c->opts.version_upgrade) {
                if (c->sb.version < bcachefs_metadata_version_new_versioning)
                        c->disk_sb.sb->version_min =
                                le16_to_cpu(bcachefs_metadata_version_min);
                c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
        }

        if (!test_bit(BCH_FS_FSCK_UNFIXED_ERRORS, &c->flags))
                c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
        mutex_unlock(&c->sb_lock);

        if (enabled_qtypes(c)) {
                bch_verbose(c, "reading quotas:");
                ret = bch2_fs_quota_read(c);
                if (ret)
                        goto err;
                bch_verbose(c, "quotas done");
        }

out:
        bch2_journal_entries_free(&journal);
        kfree(clean);
        return ret;
err:
fsck_err:
        pr_err("Error in recovery: %s (%i)", err, ret);
        goto out;
}

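/*
 * Format-time initialization of a brand new filesystem: allocate btree roots
 * and journal buckets, create the root and lost+found inodes plus the
 * lost+found dirent, then write out the superblock.
 */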
int bch2_fs_initialize(struct bch_fs *c)
{
        struct bch_inode_unpacked root_inode, lostfound_inode;
        struct bkey_inode_buf packed_inode;
        struct bch_hash_info root_hash_info;
        struct qstr lostfound = QSTR("lost+found");
        const char *err = "cannot allocate memory";
        struct bch_dev *ca;
        LIST_HEAD(journal);
        unsigned i;
        int ret;

        bch_notice(c, "initializing new filesystem");

        mutex_lock(&c->sb_lock);
        for_each_online_member(ca, c, i)
                bch2_mark_dev_superblock(c, ca, 0);
        mutex_unlock(&c->sb_lock);

        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);

        err = "unable to allocate journal buckets";
        for_each_online_member(ca, c, i) {
                ret = bch2_dev_journal_alloc(ca);
                if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        goto err;
                }
        }

        /*
         * journal_res_get() will crash if called before this has
         * set up the journal.pin FIFO and journal.cur pointer:
         */
        bch2_fs_journal_start(&c->journal);
        bch2_journal_set_replay_done(&c->journal);

        err = "error starting allocator";
        ret = bch2_fs_allocator_start(c);
        if (ret)
                goto err;

        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
        root_inode.bi_nlink++; /* lost+found */
        bch2_inode_pack(&packed_inode, &root_inode);

        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
                goto err;

        bch2_inode_init(c, &lostfound_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
                        &root_inode);
        lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
        bch2_inode_pack(&packed_inode, &lostfound_inode);

        err = "error creating lost+found";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
                goto err;

        root_hash_info = bch2_hash_info_init(c, &root_inode);

        ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
                                 &lostfound, lostfound_inode.bi_inum, NULL,
                                 BTREE_INSERT_NOFAIL);
        if (ret)
                goto err;

        if (enabled_qtypes(c)) {
                ret = bch2_fs_quota_read(c);
                if (ret)
                        goto err;
        }

        err = "error writing first journal entry";
        ret = bch2_journal_meta(&c->journal);
        if (ret)
                goto err;

        mutex_lock(&c->sb_lock);
        c->disk_sb.sb->version = c->disk_sb.sb->version_min =
                le16_to_cpu(bcachefs_metadata_version_current);
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;

        SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);

        return 0;
err:
        pr_err("Error initializing new filesystem: %s (%i)", err, ret);
        return ret;
}