// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_background.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "buckets.h"
#include "dirent.h"
#include "ec.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>
#include <linux/stat.h>

#define QSTR(n) { { { .len = strlen(n) } }, .name = n }

/* sort and dedup all keys in the journal: */

static void journal_entries_free(struct list_head *list)
{
        while (!list_empty(list)) {
                struct journal_replay *i =
                        list_first_entry(list, struct journal_replay, list);
                list_del(&i->list);
                kvpfree(i, offsetof(struct journal_replay, j) +
                        vstruct_bytes(&i->j));
        }
}

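/*
 * Sort order for deduplication: group keys by btree, then by start position,
 * breaking ties by journal sequence number and offset so that the newest
 * version of a key sorts last:
 */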
static int journal_sort_key_cmp(const void *_l, const void *_r)
{
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;

        return cmp_int(l->btree_id, r->btree_id) ?:
                bkey_cmp(l->pos, r->pos) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
}

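/*
 * Sort order for replay: strictly by journal sequence number, so keys are
 * reinserted in the order they were originally journalled:
 */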
static int journal_sort_seq_cmp(const void *_l, const void *_r)
{
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;

        return cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->btree_id, r->btree_id) ?:
                bkey_cmp(l->pos, r->pos);
}

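/*
 * When a key is trimmed in place its start position can move past that of its
 * neighbours; sift it forward until the array is sorted again:
 */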
static void journal_keys_sift(struct journal_keys *keys, struct journal_key *i)
{
        while (i + 1 < keys->d + keys->nr &&
               journal_sort_key_cmp(i, i + 1) > 0) {
                swap(i[0], i[1]);
                i++;
        }
}

static void journal_keys_free(struct journal_keys *keys)
{
        struct journal_key *i;

        for_each_journal_key(*keys, i)
                if (i->allocated)
                        kfree(i->k);
        kvfree(keys->d);
        keys->d = NULL;
        keys->nr = 0;
}

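/*
 * Flatten the keys from every journal entry into a single sorted array, then
 * resolve overlapping extents: the newer of two overlapping keys wins, and the
 * older one is trimmed or split around it (splitting is why keys_deduped is
 * sized at twice nr_keys):
 */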
static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
{
        struct journal_replay *p;
        struct jset_entry *entry;
        struct bkey_i *k, *_n;
        struct journal_keys keys = { NULL }, keys_deduped = { NULL };
        struct journal_key *i;
        size_t nr_keys = 0;

        list_for_each_entry(p, journal_entries, list)
                for_each_jset_key(k, _n, entry, &p->j)
                        nr_keys++;

        keys.journal_seq_base = keys_deduped.journal_seq_base =
                le64_to_cpu(list_first_entry(journal_entries,
                                             struct journal_replay,
                                             list)->j.seq);

        keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
        if (!keys.d)
                goto err;

        keys_deduped.d = kvmalloc(sizeof(keys.d[0]) * nr_keys * 2, GFP_KERNEL);
        if (!keys_deduped.d)
                goto err;

        list_for_each_entry(p, journal_entries, list)
                for_each_jset_key(k, _n, entry, &p->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
                                .pos            = bkey_start_pos(&k->k),
                                .k              = k,
                                .journal_seq    = le64_to_cpu(p->j.seq) -
                                        keys.journal_seq_base,
                                .journal_offset = k->_data - p->j._data,
                        };

        sort(keys.d, nr_keys, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);

        i = keys.d;
        while (i < keys.d + keys.nr) {
                if (i + 1 < keys.d + keys.nr &&
                    i[0].btree_id == i[1].btree_id &&
                    !bkey_cmp(i[0].pos, i[1].pos)) {
                        if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
                                i++;
                        } else {
                                bch2_cut_front(i[1].k->k.p, i[0].k);
                                i[0].pos = i[1].k->k.p;
                                journal_keys_sift(&keys, i);
                        }
                        continue;
                }

                if (i + 1 < keys.d + keys.nr &&
                    i[0].btree_id == i[1].btree_id &&
                    bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k)) > 0) {
                        if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
                             cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
                                if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
                                        bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k);
                                } else {
                                        struct bkey_i *split =
                                                kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);

                                        if (!split)
                                                goto err;

                                        bkey_copy(split, i[0].k);
                                        bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k);
                                        keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
                                                .btree_id       = i[0].btree_id,
                                                .allocated      = true,
                                                .pos            = bkey_start_pos(&split->k),
                                                .k              = split,
                                                .journal_seq    = i[0].journal_seq,
                                                .journal_offset = i[0].journal_offset,
                                        };

                                        bch2_cut_front(i[1].k->k.p, i[0].k);
                                        i[0].pos = i[1].k->k.p;
                                        journal_keys_sift(&keys, i);
                                        continue;
                                }
                        } else {
                                if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) >= 0) {
                                        i[1] = i[0];
                                        i++;
                                        continue;
                                } else {
                                        bch2_cut_front(i[0].k->k.p, i[1].k);
                                        i[1].pos = i[0].k->k.p;
                                        journal_keys_sift(&keys, i + 1);
                                        continue;
                                }
                        }
                }

                keys_deduped.d[keys_deduped.nr++] = *i++;
        }

        kvfree(keys.d);
        return keys_deduped;
err:
        journal_keys_free(&keys_deduped);
        kvfree(keys.d);
        return (struct journal_keys) { NULL };
}

/* journal replay: */

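/*
 * Advance the journal's replay cursor to @seq, dropping our pins on the
 * entries that have now been fully replayed:
 */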
static void replay_now_at(struct journal *j, u64 seq)
{
        BUG_ON(seq < j->replay_journal_seq);
        BUG_ON(seq > j->replay_journal_seq_end);

        while (j->replay_journal_seq < seq)
                bch2_journal_pin_put(j, j->replay_journal_seq++);
}

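/*
 * Extents can't always be inserted atomically (e.g. when they span btree
 * nodes), so replay an extent key by repeatedly trimming off an atomic
 * portion and inserting that, until the whole key has been covered:
 */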
static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
{
        struct btree_trans trans;
        struct btree_iter *iter, *split_iter;
        /*
         * We might cause compressed extents to be split, so we need to pass in
         * a disk_reservation:
         */
        struct disk_reservation disk_res =
                bch2_disk_reservation_init(c, 0);
        struct bkey_i *split;
        bool split_compressed = false;
        int ret;

        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
retry:
        bch2_trans_begin(&trans);

        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   bkey_start_pos(&k->k),
                                   BTREE_ITER_INTENT);

        do {
                ret = bch2_btree_iter_traverse(iter);
                if (ret)
                        goto err;

                split_iter = bch2_trans_copy_iter(&trans, iter);
                ret = PTR_ERR_OR_ZERO(split_iter);
                if (ret)
                        goto err;

                split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
                ret = PTR_ERR_OR_ZERO(split);
                if (ret)
                        goto err;

                if (!split_compressed &&
                    bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
                    !bch2_extent_is_atomic(k, split_iter)) {
                        ret = bch2_disk_reservation_add(c, &disk_res,
                                        k->k.size *
                                        bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
                                        BCH_DISK_RESERVATION_NOFAIL);
                        BUG_ON(ret);

                        split_compressed = true;
                }

                bkey_copy(split, k);
                bch2_cut_front(split_iter->pos, split);
                bch2_extent_trim_atomic(split, split_iter);

                bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
                bch2_btree_iter_set_pos(iter, split->k.p);
        } while (bkey_cmp(iter->pos, k->k.p) < 0);

        if (split_compressed) {
                ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
                                          -((s64) k->k.size),
                                          BCH_BUCKET_MARK_OVERWRITE) ?:
                      bch2_trans_commit(&trans, &disk_res, NULL,
                                        BTREE_INSERT_ATOMIC|
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_LAZY_RW|
                                        BTREE_INSERT_NOMARK_OVERWRITES|
                                        BTREE_INSERT_NO_CLEAR_REPLICAS);
        } else {
                ret = bch2_trans_commit(&trans, &disk_res, NULL,
                                        BTREE_INSERT_ATOMIC|
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_LAZY_RW|
                                        BTREE_INSERT_JOURNAL_REPLAY|
                                        BTREE_INSERT_NOMARK);
        }

        if (ret)
                goto err;
err:
        if (ret == -EINTR)
                goto retry;

        bch2_disk_reservation_put(c, &disk_res);

        return bch2_trans_exit(&trans) ?: ret;
}

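/*
 * Replay keys from the journal into the btrees in journal order: alloc and
 * extent keys need special handling, everything else is a plain btree insert:
 */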
static int bch2_journal_replay(struct bch_fs *c,
                               struct journal_keys keys)
{
        struct journal *j = &c->journal;
        struct journal_key *i;
        int ret;

        sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);

        for_each_journal_key(keys, i) {
                replay_now_at(j, keys.journal_seq_base + i->journal_seq);

                switch (i->btree_id) {
                case BTREE_ID_ALLOC:
                        ret = bch2_alloc_replay_key(c, i->k);
                        break;
                case BTREE_ID_EXTENTS:
                        ret = bch2_extent_replay_key(c, i->k);
                        break;
                default:
                        ret = bch2_btree_insert(c, i->btree_id, i->k,
                                                NULL, NULL,
                                                BTREE_INSERT_NOFAIL|
                                                BTREE_INSERT_LAZY_RW|
                                                BTREE_INSERT_JOURNAL_REPLAY|
                                                BTREE_INSERT_NOMARK);
                        break;
                }

                if (ret) {
                        bch_err(c, "journal replay: error %d while replaying key",
                                ret);
                        return ret;
                }

                cond_resched();
        }

        replay_now_at(j, j->replay_journal_seq_end);
        j->replay_journal_seq = 0;

        bch2_journal_set_replay_done(j);
        bch2_journal_flush_all_pins(j);
        return bch2_journal_error(j);
}

static bool journal_empty(struct list_head *journal)
{
        return list_empty(journal) ||
                journal_entry_empty(&list_last_entry(journal,
                                        struct journal_replay, list)->j);
}

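/*
 * Walk the journal entries we read and check that every sequence number we
 * expect is either present or blacklisted, and that none of the entries we
 * did read were blacklisted:
 */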
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
                                                  struct list_head *journal)
{
        struct journal_replay *i =
                list_last_entry(journal, struct journal_replay, list);
        u64 start_seq   = le64_to_cpu(i->j.last_seq);
        u64 end_seq     = le64_to_cpu(i->j.seq);
        u64 seq         = start_seq;
        int ret = 0;

        list_for_each_entry(i, journal, list) {
                fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
                        "journal entries %llu-%llu missing! (replaying %llu-%llu)",
                        seq, le64_to_cpu(i->j.seq) - 1,
                        start_seq, end_seq);

                seq = le64_to_cpu(i->j.seq);

                fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
                            "found blacklisted journal entry %llu", seq);

                do {
                        seq++;
                } while (bch2_journal_seq_is_blacklisted(c, seq, false));
        }
fsck_err:
        return ret;
}

/* journal replay early: */

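/*
 * Some journal entries carry filesystem state rather than btree keys (btree
 * roots, usage counters, blacklisted sequence numbers); those have to be
 * applied before the btrees can be read:
 */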
static int journal_replay_entry_early(struct bch_fs *c,
                                      struct jset_entry *entry)
{
        int ret = 0;

        switch (entry->type) {
        case BCH_JSET_ENTRY_btree_root: {
                struct btree_root *r;

                if (entry->btree_id >= BTREE_ID_NR) {
                        bch_err(c, "filesystem has unknown btree type %u",
                                entry->btree_id);
                        return -EINVAL;
                }

                r = &c->btree_roots[entry->btree_id];

                if (entry->u64s) {
                        r->level = entry->level;
                        bkey_copy(&r->key, &entry->start[0]);
                        r->error = 0;
                } else {
                        r->error = -EIO;
                }
                r->alive = true;
                break;
        }
        case BCH_JSET_ENTRY_usage: {
                struct jset_entry_usage *u =
                        container_of(entry, struct jset_entry_usage, entry);

                switch (entry->btree_id) {
                case FS_USAGE_RESERVED:
                        if (entry->level < BCH_REPLICAS_MAX)
                                c->usage_base->persistent_reserved[entry->level] =
                                        le64_to_cpu(u->v);
                        break;
                case FS_USAGE_INODES:
                        c->usage_base->nr_inodes = le64_to_cpu(u->v);
                        break;
                case FS_USAGE_KEY_VERSION:
                        atomic64_set(&c->key_version,
                                     le64_to_cpu(u->v));
                        break;
                }

                break;
        }
        case BCH_JSET_ENTRY_data_usage: {
                struct jset_entry_data_usage *u =
                        container_of(entry, struct jset_entry_data_usage, entry);
                ret = bch2_replicas_set_usage(c, &u->r,
                                              le64_to_cpu(u->v));
                break;
        }
        case BCH_JSET_ENTRY_blacklist: {
                struct jset_entry_blacklist *bl_entry =
                        container_of(entry, struct jset_entry_blacklist, entry);

                ret = bch2_journal_seq_blacklist_add(c,
                                le64_to_cpu(bl_entry->seq),
                                le64_to_cpu(bl_entry->seq) + 1);
                break;
        }
        case BCH_JSET_ENTRY_blacklist_v2: {
                struct jset_entry_blacklist_v2 *bl_entry =
                        container_of(entry, struct jset_entry_blacklist_v2, entry);

                ret = bch2_journal_seq_blacklist_add(c,
                                le64_to_cpu(bl_entry->start),
                                le64_to_cpu(bl_entry->end) + 1);
                break;
        }
        }

        return ret;
}

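/*
 * After a clean shutdown this state lives in the superblock's clean section;
 * otherwise it has to be reconstructed from the journal entries themselves:
 */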
static int journal_replay_early(struct bch_fs *c,
                                struct bch_sb_field_clean *clean,
                                struct list_head *journal)
{
        struct jset_entry *entry;
        int ret;

        if (clean) {
                c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
                c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);

                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
                        ret = journal_replay_entry_early(c, entry);
                        if (ret)
                                return ret;
                }
        } else {
                struct journal_replay *i =
                        list_last_entry(journal, struct journal_replay, list);

                c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
                c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);

                list_for_each_entry(i, journal, list)
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
                                        return ret;
                        }
        }

        bch2_fs_usage_initialize(c);

        return 0;
}

/* sb clean section: */

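/*
 * Find the root of btree @id in either the superblock clean section or a
 * journal entry, returning the key and, via @level, its depth:
 */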
static struct bkey_i *btree_root_find(struct bch_fs *c,
                                      struct bch_sb_field_clean *clean,
                                      struct jset *j,
                                      enum btree_id id, unsigned *level)
{
        struct bkey_i *k;
        struct jset_entry *entry, *start, *end;

        if (clean) {
                start = clean->start;
                end = vstruct_end(&clean->field);
        } else {
                start = j->start;
                end = vstruct_last(j);
        }

        for (entry = start; entry < end; entry = vstruct_next(entry))
                if (entry->type == BCH_JSET_ENTRY_btree_root &&
                    entry->btree_id == id)
                        goto found;

        return NULL;
found:
        if (!entry->u64s)
                return ERR_PTR(-EINVAL);

        k = entry->start;
        *level = entry->level;
        return k;
}

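/*
 * After a clean shutdown the superblock clean section and the last journal
 * entry should agree; check the journal sequence number, clock hands and
 * btree roots, and if the sequence numbers don't match discard the clean
 * section and recover from the journal instead:
 */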
static int verify_superblock_clean(struct bch_fs *c,
                                   struct bch_sb_field_clean **cleanp,
                                   struct jset *j)
{
        unsigned i;
        struct bch_sb_field_clean *clean = *cleanp;
        int ret = 0;

        if (!c->sb.clean || !j)
                return 0;

        if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
                        "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
                        le64_to_cpu(clean->journal_seq),
                        le64_to_cpu(j->seq))) {
                kfree(clean);
                *cleanp = NULL;
                return 0;
        }

        mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
                        "superblock read clock doesn't match journal after clean shutdown");
        mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
                        "superblock write clock doesn't match journal after clean shutdown");

        for (i = 0; i < BTREE_ID_NR; i++) {
                struct bkey_i *k1, *k2;
                unsigned l1 = 0, l2 = 0;

                k1 = btree_root_find(c, clean, NULL, i, &l1);
                k2 = btree_root_find(c, NULL, j, i, &l2);

                if (!k1 && !k2)
                        continue;

                mustfix_fsck_err_on(!k1 || !k2 ||
                                    IS_ERR(k1) ||
                                    IS_ERR(k2) ||
                                    k1->k.u64s != k2->k.u64s ||
                                    memcmp(k1, k2, bkey_bytes(k1)) ||
                                    l1 != l2, c,
                        "superblock btree root doesn't match journal after clean shutdown");
        }
fsck_err:
        return ret;
}

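/*
 * Read the clean section out of the superblock, returning a kmalloced copy,
 * or NULL (with the clean flag cleared) if it was missing:
 */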
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
{
        struct bch_sb_field_clean *clean, *sb_clean;
        int ret;

        mutex_lock(&c->sb_lock);
        sb_clean = bch2_sb_get_clean(c->disk_sb.sb);

        if (fsck_err_on(!sb_clean, c,
                        "superblock marked clean but clean section not present")) {
                SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                c->sb.clean = false;
                mutex_unlock(&c->sb_lock);
                return NULL;
        }

        clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
                        GFP_KERNEL);
        if (!clean) {
                mutex_unlock(&c->sb_lock);
                return ERR_PTR(-ENOMEM);
        }

        if (le16_to_cpu(c->disk_sb.sb->version) <
            bcachefs_metadata_version_bkey_renumber)
                bch2_sb_clean_renumber(clean, READ);

        mutex_unlock(&c->sb_lock);

        return clean;
fsck_err:
        mutex_unlock(&c->sb_lock);
        return ERR_PTR(ret);
}

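/*
 * Read in the btree roots collected by journal_replay_early(); errors in the
 * alloc btree are not fatal, since alloc info can be reconstructed:
 */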
static int read_btree_roots(struct bch_fs *c)
{
        unsigned i;
        int ret = 0;

        for (i = 0; i < BTREE_ID_NR; i++) {
                struct btree_root *r = &c->btree_roots[i];

                if (!r->alive)
                        continue;

                if (i == BTREE_ID_ALLOC &&
                    test_reconstruct_alloc(c)) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        continue;
                }

                if (r->error) {
                        __fsck_err(c, i == BTREE_ID_ALLOC
                                   ? FSCK_CAN_IGNORE : 0,
                                   "invalid btree root %s",
                                   bch2_btree_ids[i]);
                        if (i == BTREE_ID_ALLOC)
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                }

                ret = bch2_btree_root_read(c, i, &r->key, r->level);
                if (ret) {
                        __fsck_err(c, i == BTREE_ID_ALLOC
                                   ? FSCK_CAN_IGNORE : 0,
                                   "error reading btree root %s",
                                   bch2_btree_ids[i]);
                        if (i == BTREE_ID_ALLOC)
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                }
        }

        for (i = 0; i < BTREE_ID_NR; i++)
                if (!c->btree_roots[i].b)
                        bch2_btree_root_alloc(c, i);
fsck_err:
        return ret;
}

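/*
 * Recovery proper: read the superblock clean section and/or the journal,
 * apply the early (non-btree) entries, start the journal, read btree roots
 * and alloc info, run gc if necessary, replay the journal, then fsck and
 * finally update the superblock:
 */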
int bch2_fs_recovery(struct bch_fs *c)
{
        const char *err = "cannot allocate memory";
        struct bch_sb_field_clean *clean = NULL;
        u64 journal_seq;
        LIST_HEAD(journal_entries);
        struct journal_keys journal_keys = { NULL };
        bool wrote = false, write_sb = false;
        int ret;

        if (c->sb.clean)
                clean = read_superblock_clean(c);
        ret = PTR_ERR_OR_ZERO(clean);
        if (ret)
                goto err;

        if (c->sb.clean)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));

        if (!c->replicas.entries) {
                bch_info(c, "building replicas info");
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }

        if (!c->sb.clean || c->opts.fsck) {
                struct jset *j;

                ret = bch2_journal_read(c, &journal_entries);
                if (ret)
                        goto err;

                if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&journal_entries), c,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
                        c->sb.clean = false;
                }

                if (!c->sb.clean && list_empty(&journal_entries)) {
                        bch_err(c, "no journal entries found");
                        ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
                        goto err;
                }

                journal_keys = journal_keys_sort(&journal_entries);
                if (!journal_keys.d) {
                        ret = -ENOMEM;
                        goto err;
                }

                j = &list_last_entry(&journal_entries,
                                     struct journal_replay, list)->j;

                ret = verify_superblock_clean(c, &clean, j);
                if (ret)
                        goto err;

                journal_seq = le64_to_cpu(j->seq) + 1;
        } else {
                journal_seq = le64_to_cpu(clean->journal_seq) + 1;
        }

        ret = journal_replay_early(c, clean, &journal_entries);
        if (ret)
                goto err;

        if (!c->sb.clean) {
                ret = bch2_journal_seq_blacklist_add(c,
                                                     journal_seq,
                                                     journal_seq + 4);
                if (ret) {
                        bch_err(c, "error creating new journal seq blacklist entry");
                        goto err;
                }

                journal_seq += 4;
        }

        ret = bch2_blacklist_table_initialize(c);

        if (!list_empty(&journal_entries)) {
                ret = verify_journal_entries_not_blacklisted_or_missing(c,
                                                        &journal_entries);
                if (ret)
                        goto err;
        }

        ret = bch2_fs_journal_start(&c->journal, journal_seq,
                                    &journal_entries);
        if (ret)
                goto err;

        ret = read_btree_roots(c);
        if (ret)
                goto err;

        bch_verbose(c, "starting alloc read");
        err = "error reading allocation information";
        ret = bch2_alloc_read(c, &journal_keys);
        if (ret)
                goto err;
        bch_verbose(c, "alloc read done");

        bch_verbose(c, "starting stripes_read");
        err = "error reading stripes";
        ret = bch2_stripes_read(c, &journal_keys);
        if (ret)
                goto err;
        bch_verbose(c, "stripes_read done");

        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);

        if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) &&
            !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) {
                /*
                 * interior btree node updates aren't consistent with the
                 * journal; after an unclean shutdown we have to walk all
                 * pointers to metadata:
                 */
                bch_info(c, "starting metadata mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, NULL, true, true);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
        }

        if (c->opts.fsck ||
            !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
            test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
                bch_info(c, "starting mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, &journal_keys, true, false);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
        }

        clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

        /*
         * Skip past versions that might have possibly been used (as nonces),
         * but hadn't had their pointers written:
         */
        if (c->sb.encryption_type && !c->sb.clean)
                atomic64_add(1 << 16, &c->key_version);

        if (c->opts.norecovery)
                goto out;

        bch_verbose(c, "starting journal replay");
        err = "journal replay failed";
        ret = bch2_journal_replay(c, journal_keys);
        if (ret)
                goto err;
        bch_verbose(c, "journal replay done");

        if (!c->opts.nochanges) {
                /*
                 * note that even when filesystem was clean there might be work
                 * to do here, if we ran gc (because of fsck) which recalculated
                 * oldest_gen:
                 */
                bch_verbose(c, "writing allocation info");
                err = "error writing out alloc info";
                ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?:
                        bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote);
                if (ret) {
                        bch_err(c, "error writing alloc info");
                        goto err;
                }
                bch_verbose(c, "alloc write done");
        }

        if (!c->sb.clean) {
                if (!(c->sb.features & (1ULL << BCH_FEATURE_ATOMIC_NLINK))) {
                        bch_info(c, "checking inode link counts");
                        err = "error in recovery";
                        ret = bch2_fsck_inode_nlink(c);
                        if (ret)
                                goto err;
                        bch_verbose(c, "check inodes done");
                } else {
                        bch_verbose(c, "checking for deleted inodes");
                        err = "error in recovery";
                        ret = bch2_fsck_walk_inodes_only(c);
                        if (ret)
                                goto err;
                        bch_verbose(c, "check inodes done");
                }
        }

        if (c->opts.fsck) {
                bch_info(c, "starting fsck");
                err = "error in fsck";
                ret = bch2_fsck_full(c);
                if (ret)
                        goto err;
                bch_verbose(c, "fsck done");
        }

        if (enabled_qtypes(c)) {
                bch_verbose(c, "reading quotas");
                ret = bch2_fs_quota_read(c);
                if (ret)
                        goto err;
                bch_verbose(c, "quotas done");
        }

        mutex_lock(&c->sb_lock);
        if (c->opts.version_upgrade) {
                if (c->sb.version < bcachefs_metadata_version_new_versioning)
                        c->disk_sb.sb->version_min =
                                cpu_to_le16(bcachefs_metadata_version_min);
                c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
                write_sb = true;
        }

        if (!test_bit(BCH_FS_ERROR, &c->flags)) {
                c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
                write_sb = true;
        }

        if (c->opts.fsck &&
            !test_bit(BCH_FS_ERROR, &c->flags)) {
                c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
                write_sb = true;
        }

        if (write_sb)
                bch2_write_super(c);
        mutex_unlock(&c->sb_lock);

        if (c->journal_seq_blacklist_table &&
            c->journal_seq_blacklist_table->nr > 128)
                queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
out:
        ret = 0;
err:
fsck_err:
        bch2_flush_fsck_errs(c);
        journal_keys_free(&journal_keys);
        journal_entries_free(&journal_entries);
        kfree(clean);
        if (ret)
                bch_err(c, "Error in recovery: %s (%i)", err, ret);
        else
                bch_verbose(c, "ret %i", ret);
        return ret;
}

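/*
 * Initialize a freshly formatted filesystem: allocate btree roots and journal
 * buckets, go read-write, then create the root directory, lost+found and the
 * first journal entry:
 */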
int bch2_fs_initialize(struct bch_fs *c)
{
        struct bch_inode_unpacked root_inode, lostfound_inode;
        struct bkey_inode_buf packed_inode;
        struct bch_hash_info root_hash_info;
        struct qstr lostfound = QSTR("lost+found");
        const char *err = "cannot allocate memory";
        struct bch_dev *ca;
        LIST_HEAD(journal);
        unsigned i;
        int ret;

        bch_notice(c, "initializing new filesystem");

        mutex_lock(&c->sb_lock);
        for_each_online_member(ca, c, i)
                bch2_mark_dev_superblock(c, ca, 0);
        mutex_unlock(&c->sb_lock);

        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);

        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);

        err = "unable to allocate journal buckets";
        for_each_online_member(ca, c, i) {
                ret = bch2_dev_journal_alloc(ca);
                if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        goto err;
                }
        }

        /*
         * journal_res_get() will crash if called before this has
         * set up the journal.pin FIFO and journal.cur pointer:
         */
        bch2_fs_journal_start(&c->journal, 1, &journal);
        bch2_journal_set_replay_done(&c->journal);

        err = "error going read write";
        ret = __bch2_fs_read_write(c, true);
        if (ret)
                goto err;

        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
        root_inode.bi_inum = BCACHEFS_ROOT_INO;
        root_inode.bi_nlink++; /* lost+found */
        bch2_inode_pack(&packed_inode, &root_inode);

        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
                goto err;

        bch2_inode_init(c, &lostfound_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
                        &root_inode);
        lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
        bch2_inode_pack(&packed_inode, &lostfound_inode);

        err = "error creating lost+found";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
                goto err;

        root_hash_info = bch2_hash_info_init(c, &root_inode);

        ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
                                 &lostfound, lostfound_inode.bi_inum, NULL,
                                 BTREE_INSERT_NOFAIL);
        if (ret)
                goto err;

        if (enabled_qtypes(c)) {
                ret = bch2_fs_quota_read(c);
                if (ret)
                        goto err;
        }

        err = "error writing first journal entry";
        ret = bch2_journal_meta(&c->journal);
        if (ret)
                goto err;

        mutex_lock(&c->sb_lock);
        c->disk_sb.sb->version = c->disk_sb.sb->version_min =
                cpu_to_le16(bcachefs_metadata_version_current);
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_ATOMIC_NLINK;

        SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);

        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);

        return 0;
err:
        pr_err("Error initializing new filesystem: %s (%i)", err, ret);
        return ret;
}