#include "bcachefs.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"

#include <linux/backing-dev.h>
#include <linux/sort.h>

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);

/* superblock fields (optional/variable size sections): */

const char * const bch2_sb_fields[] = {
#define x(name, nr)     #name,
        BCH_SB_FIELDS()
#undef x
        NULL
};

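/*
 * BCH_SB_FIELDS() is an x-macro list of the optional superblock field types;
 * expanding it below generates a validate-function prototype and a
 * bch2_sb_field_ops[] entry per field type, so adding a new field type only
 * means extending the list and supplying bch2_sb_validate_<name>().
 */
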
#define x(f, nr)                                        \
static const char *bch2_sb_validate_##f(struct bch_sb *, struct bch_sb_field *);
        BCH_SB_FIELDS()
#undef x

struct bch_sb_field_ops {
        const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
};

static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
#define x(f, nr)                                        \
        [BCH_SB_FIELD_##f] = {                          \
                .validate = bch2_sb_validate_##f,       \
        },
        BCH_SB_FIELDS()
#undef x
};

static const char *bch2_sb_field_validate(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        unsigned type = le32_to_cpu(f->type);

        return type < BCH_SB_FIELD_NR
                ? bch2_sb_field_ops[type].validate(sb, f)
                : NULL;
}

struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
                                      enum bch_sb_field_type type)
{
        struct bch_sb_field *f;

        /* XXX: need locking around superblock to access optional fields */

        vstruct_for_each(sb, f)
                if (le32_to_cpu(f->type) == type)
                        return f;
        return NULL;
}

static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
                                                  struct bch_sb_field *f,
                                                  unsigned u64s)
{
        unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;

        if (!f) {
                f = vstruct_last(sb);
                memset(f, 0, sizeof(u64) * u64s);
                f->u64s = cpu_to_le32(u64s);
                f->type = 0;
        } else {
                void *src, *dst;

                src = vstruct_end(f);
                f->u64s = cpu_to_le32(u64s);
                dst = vstruct_end(f);

                memmove(dst, src, vstruct_end(sb) - src);

                if (dst > src)
                        memset(src, 0, dst - src);
        }

        le32_add_cpu(&sb->u64s, u64s - old_u64s);

        return f;
}
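
/*
 * Note on __bch2_sb_field_resize(): optional fields are stored contiguously,
 * so resizing one means shifting everything after it. Updating f->u64s moves
 * vstruct_end(f); the memmove() then relocates the tail of the superblock,
 * and when the field grows, the newly opened gap is zeroed so later readers
 * never see stale bytes.
 */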

/* Superblock realloc/free: */

void bch2_free_super(struct bch_sb_handle *sb)
{
        if (sb->bio)
                bio_put(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
                blkdev_put(sb->bdev, sb->mode);

        free_pages((unsigned long) sb->sb, sb->page_order);
        memset(sb, 0, sizeof(*sb));
}

static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
{
        struct bch_sb *new_sb;
        struct bio *bio;

        if (sb->page_order >= order && sb->sb)
                return 0;

        if (dynamic_fault("bcachefs:add:super_realloc"))
                return -ENOMEM;

        bio = bio_kmalloc(GFP_KERNEL, 1 << order);
        if (!bio)
                return -ENOMEM;

        if (sb->bio)
                bio_put(sb->bio);
        sb->bio = bio;

        new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
        if (!new_sb)
                return -ENOMEM;

        if (sb->sb)
                memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);

        free_pages((unsigned long) sb->sb, sb->page_order);
        sb->sb = new_sb;

        sb->page_order = order;

        return 0;
}

static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
{
        u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
        u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;

        if (new_bytes > max_bytes) {
                char buf[BDEVNAME_SIZE];

                pr_err("%s: superblock too big: want %llu but have %llu",
                       bdevname(sb->bdev, buf), new_bytes, max_bytes);
                return -ENOSPC;
        }

        return __bch2_super_realloc(sb, get_order(new_bytes));
}

static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
{
        u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
        struct bch_sb *sb;
        unsigned order = get_order(bytes);

        if (c->disk_sb && order <= c->disk_sb_order)
                return 0;

        sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
        if (!sb)
                return -ENOMEM;

        if (c->disk_sb)
                memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);

        free_pages((unsigned long) c->disk_sb, c->disk_sb_order);

        c->disk_sb = sb;
        c->disk_sb_order = order;
        return 0;
}

struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                                          enum bch_sb_field_type type,
                                          unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;

        if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
                return NULL;

        f = __bch2_sb_field_resize(sb->sb, f, u64s);
        f->type = cpu_to_le32(type);
        return f;
}

struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
                                            enum bch_sb_field_type type,
                                            unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
                return NULL;

        /* XXX: we're not checking that offline devices have enough space */

        for_each_online_member(ca, c, i) {
                struct bch_sb_handle *sb = &ca->disk_sb;

                if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
                        percpu_ref_put(&ca->ref);
                        return NULL;
                }
        }

        f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
        f->type = cpu_to_le32(type);
        return f;
}
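
/*
 * Rough usage sketch (hypothetical caller, for illustration only): growing a
 * field is done with c->sb_lock held, and a NULL return means the resize (or
 * a per-device superblock reallocation) failed:
 *
 *      mutex_lock(&c->sb_lock);
 *      f = bch2_fs_sb_field_resize(c, BCH_SB_FIELD_replicas, u64s);
 *      if (!f)
 *              goto err;               // treat as -ENOSPC/-ENOMEM
 *      ...fill in the field contents...
 *      bch2_write_super(c);
 *      mutex_unlock(&c->sb_lock);
 */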

/* Superblock validate: */

static inline void __bch2_sb_layout_size_assert(void)
{
        BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}

static const char *validate_sb_layout(struct bch_sb_layout *layout)
{
        u64 offset, prev_offset, max_sectors;
        unsigned i;

        if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock layout";

        if (layout->layout_type != 0)
                return "Invalid superblock layout type";

        if (!layout->nr_superblocks)
                return "Invalid superblock layout: no superblocks";

        if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
                return "Invalid superblock layout: too many superblocks";

        max_sectors = 1 << layout->sb_max_size_bits;

        prev_offset = le64_to_cpu(layout->sb_offset[0]);

        for (i = 1; i < layout->nr_superblocks; i++) {
                offset = le64_to_cpu(layout->sb_offset[i]);

                if (offset < prev_offset + max_sectors)
                        return "Invalid superblock layout: superblocks overlap";
                prev_offset = offset;
        }

        return NULL;
}
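
/*
 * For example: with sb_max_size_bits == 7, each superblock copy may occupy up
 * to 1 << 7 = 128 sectors (64KiB at 512-byte sectors), so consecutive
 * sb_offset[] entries must be at least 128 sectors apart to pass the overlap
 * check above.
 */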

const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
{
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *mi;
        const char *err;
        u16 block_size;

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
                SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
                SET_BCH_SB_POSIX_ACL(sb, 1);
        }

        block_size = le16_to_cpu(sb->block_size);

        if (!is_power_of_2(block_size) ||
            block_size > PAGE_SECTORS)
                return "Bad block size";

        if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
                return "Bad user UUID";

        if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
                return "Bad internal UUID";

        if (!sb->nr_devices ||
            sb->nr_devices <= sb->dev_idx ||
            sb->nr_devices > BCH_SB_MEMBERS_MAX)
                return "Bad number of member devices";

        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
            BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_META_REPLICAS_REQ(sb) ||
            BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
            BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
            BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
                return "Invalid metadata checksum type";

        if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
                return "Invalid data checksum type";

        if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
                return "Invalid compression type";

        if (!BCH_SB_BTREE_NODE_SIZE(sb))
                return "Btree node size not set";

        if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
                return "Btree node size not a power of two";

        if (BCH_SB_GC_RESERVE(sb) < 5)
                return "gc reserve percentage too small";

        if (!sb->time_precision ||
            le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
                return "invalid time precision";

        /* validate layout */
        err = validate_sb_layout(&sb->layout);
        if (err)
                return err;

        vstruct_for_each(sb, f) {
                if (!f->u64s)
                        return "Invalid superblock: invalid optional field";

                if (vstruct_next(f) > vstruct_last(sb))
                        return "Invalid superblock: invalid optional field";
        }

        /* members must be validated first: */
        mi = bch2_sb_get_members(sb);
        if (!mi)
                return "Invalid superblock: member info area missing";

        err = bch2_sb_field_validate(sb, &mi->field);
        if (err)
                return err;

        vstruct_for_each(sb, f) {
                if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
                        continue;

                err = bch2_sb_field_validate(sb, f);
                if (err)
                        return err;
        }

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
            bch2_sb_get_crypt(sb) &&
            BCH_SB_INITIALIZED(sb))
                return "Incompatible extent nonces";

        sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);

        return NULL;
}

/* device open: */

static void bch2_sb_update(struct bch_fs *c)
{
        struct bch_sb *src = c->disk_sb;
        struct bch_sb_field_members *mi = bch2_sb_get_members(src);
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        c->sb.uuid              = src->uuid;
        c->sb.user_uuid         = src->user_uuid;
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
        c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
        c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
        c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
        c->sb.time_precision    = le32_to_cpu(src->time_precision);

        for_each_member_device(ca, c, i)
                ca->mi = bch2_mi_to_cpu(mi->members + i);
}

/* doesn't copy per-device state (dev_idx); the journal field is skipped: */
static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
{
        struct bch_sb_field *src_f, *dst_f;

        dst->version            = src->version;
        dst->seq                = src->seq;
        dst->uuid               = src->uuid;
        dst->user_uuid          = src->user_uuid;
        memcpy(dst->label,      src->label, sizeof(dst->label));

        dst->block_size         = src->block_size;
        dst->nr_devices         = src->nr_devices;

        dst->time_base_lo       = src->time_base_lo;
        dst->time_base_hi       = src->time_base_hi;
        dst->time_precision     = src->time_precision;

        memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
        memcpy(dst->features,   src->features,  sizeof(dst->features));
        memcpy(dst->compat,     src->compat,    sizeof(dst->compat));

        vstruct_for_each(src, src_f) {
                if (le32_to_cpu(src_f->type) == BCH_SB_FIELD_journal)
                        continue;

                dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
                dst_f = __bch2_sb_field_resize(dst, dst_f,
                                le32_to_cpu(src_f->u64s));

                memcpy(dst_f, src_f, vstruct_bytes(src_f));
        }
}
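
/*
 * Journal buckets are per-device, which is why BCH_SB_FIELD_journal is not
 * copied above: each member keeps its own journal field, and the u64s
 * accounting in bch2_sb_to_fs()/bch2_sb_from_fs() below adds or subtracts
 * its size accordingly.
 */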

int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(src);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        int ret;

        lockdep_assert_held(&c->sb_lock);

        ret = bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s);
        if (ret)
                return ret;

        __copy_super(c->disk_sb, src);

        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;

        ret = bch2_sb_disk_groups_to_cpu(c);
        if (ret)
                return ret;

        bch2_sb_update(c);
        return 0;
}

int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(dst);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
        int ret;

        ret = bch2_sb_realloc(&ca->disk_sb, u64s);
        if (ret)
                return ret;

        __copy_super(dst, src);
        return 0;
}

/* read superblock: */

static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
{
        struct bch_csum csum;
        size_t bytes;
        unsigned order;
reread:
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
        sb->bio->bi_iter.bi_sector = offset;
        sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(sb->bio, sb->sb);

        if (submit_bio_wait(sb->bio))
                return "IO error";

        if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock";

        if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        bytes = vstruct_bytes(sb->sb);

        if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
                return "Bad superblock: too big";

        order = get_order(bytes);
        if (order > sb->page_order) {
                if (__bch2_super_realloc(sb, order))
                        return "cannot allocate memory";
                goto reread;
        }

        if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
                return "unknown csum type";

        /* XXX: verify MACs */
        csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
                            null_nonce(), sb->sb);

        if (bch2_crc_cmp(csum, sb->sb->csum))
                return "bad checksum reading superblock";

        return NULL;
}
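
/*
 * Note the reread loop above: the superblock is first read into the buffer at
 * its current size; if vstruct_bytes() then reports more field data than the
 * buffer holds, the buffer is grown and the read restarts, so the checksum is
 * always computed over the complete structure.
 */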

int bch2_read_super(const char *path, struct bch_opts *opts,
                    struct bch_sb_handle *sb)
{
        u64 offset = opt_get(*opts, sb);
        struct bch_sb_layout layout;
        const char *err;
        __le64 *i;
        int ret;

        pr_verbose_init(*opts, "");

        memset(sb, 0, sizeof(*sb));
        sb->mode = FMODE_READ;

        if (!opt_get(*opts, noexcl))
                sb->mode |= FMODE_EXCL;

        if (!opt_get(*opts, nochanges))
                sb->mode |= FMODE_WRITE;

        sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
        if (IS_ERR(sb->bdev) &&
            PTR_ERR(sb->bdev) == -EACCES &&
            opt_get(*opts, read_only)) {
                sb->mode &= ~FMODE_WRITE;

                sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
                if (!IS_ERR(sb->bdev))
                        opt_set(*opts, nochanges, true);
        }

        if (IS_ERR(sb->bdev)) {
                ret = PTR_ERR(sb->bdev);
                goto out;
        }

        err = "cannot allocate memory";
        ret = __bch2_super_realloc(sb, 0);
        if (ret)
                goto err;

        ret = -EFAULT;
        err = "dynamic fault";
        if (bch2_fs_init_fault("read_super"))
                goto err;

        ret = -EINVAL;
        err = read_one_super(sb, offset);
        if (!err)
                goto got_super;

        if (opt_defined(*opts, sb))
                goto err;

        pr_err("error reading default superblock: %s", err);

        /*
         * Error reading primary superblock - read location of backup
         * superblocks:
         */
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
        sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
        sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        /*
         * use sb buffer to read layout, since sb buffer is page aligned but
         * layout won't be:
         */
        bch2_bio_map(sb->bio, sb->sb);

        err = "IO error";
        if (submit_bio_wait(sb->bio))
                goto err;

        memcpy(&layout, sb->sb, sizeof(layout));
        err = validate_sb_layout(&layout);
        if (err)
                goto err;

        for (i = layout.sb_offset;
             i < layout.sb_offset + layout.nr_superblocks; i++) {
                offset = le64_to_cpu(*i);

                if (offset == opt_get(*opts, sb))
                        continue;

                err = read_one_super(sb, offset);
                if (!err)
                        goto got_super;
        }

        ret = -EINVAL;
        goto err;

got_super:
        err = "Superblock block size smaller than device block size";
        ret = -EINVAL;
        if (le16_to_cpu(sb->sb->block_size) << 9 <
            bdev_logical_block_size(sb->bdev))
                goto err;

        if (sb->mode & FMODE_WRITE)
                bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
                        |= BDI_CAP_STABLE_WRITES;
        ret = 0;
out:
        pr_verbose_init(*opts, "ret %i", ret);
        return ret;
err:
        bch2_free_super(sb);
        pr_err("error reading superblock: %s", err);
        goto out;
}

/* write superblock: */

static void write_super_endio(struct bio *bio)
{
        struct bch_dev *ca = bio->bi_private;

        /* XXX: return errors directly */

        if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
                ca->sb_write_error = 1;

        closure_put(&ca->fs->sb_write);
        percpu_ref_put(&ca->io_ref);
}

static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
        struct bch_sb *sb = ca->disk_sb.sb;
        struct bio *bio = ca->disk_sb.bio;

        sb->offset = sb->layout.sb_offset[idx];

        SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
        sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
                                null_nonce(), sb);

        bio_reset(bio);
        bio_set_dev(bio, ca->disk_sb.bdev);
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
        bio->bi_iter.bi_size    =
                roundup(vstruct_bytes(sb),
                        bdev_logical_block_size(ca->disk_sb.bdev));
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
        bch2_bio_map(bio, sb);

        this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
                     bio_sectors(bio));

        percpu_ref_get(&ca->io_ref);
        closure_bio_submit(bio, &c->sb_write);
}

void bch2_write_super(struct bch_fs *c)
{
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
        unsigned i, sb = 0, nr_wrote;
        const char *err;
        struct bch_devs_mask sb_written;
        bool wrote, can_mount_without_written, can_mount_with_written;

        lockdep_assert_held(&c->sb_lock);

        closure_init_stack(cl);
        memset(&sb_written, 0, sizeof(sb_written));

        le64_add_cpu(&c->disk_sb->seq, 1);

        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);

        for_each_online_member(ca, c, i) {
                err = bch2_sb_validate(&ca->disk_sb);
                if (err) {
                        bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
                        goto out;
                }
        }

        if (c->opts.nochanges ||
            test_bit(BCH_FS_ERROR, &c->flags))
                goto out;

        for_each_online_member(ca, c, i) {
                __set_bit(ca->dev_idx, sb_written.d);
                ca->sb_write_error = 0;
        }

        do {
                wrote = false;
                for_each_online_member(ca, c, i)
                        if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
                                write_one_super(c, ca, sb);
                                wrote = true;
                        }
                closure_sync(cl);
                sb++;
        } while (wrote);

        for_each_online_member(ca, c, i)
                if (ca->sb_write_error)
                        __clear_bit(ca->dev_idx, sb_written.d);

        nr_wrote = dev_mask_nr(&sb_written);

        can_mount_with_written =
                bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
                                      BCH_FORCE_IF_DEGRADED);

        for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
                sb_written.d[i] = ~sb_written.d[i];

        can_mount_without_written =
                bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
                                      BCH_FORCE_IF_DEGRADED);

        /*
         * If we would be able to mount _without_ the devices we successfully
         * wrote superblocks to, we weren't able to write to enough devices:
         *
         * Exception: if we can mount without the successes because we haven't
         * written anything (new filesystem), we continue if we'd be able to
         * mount with the devices we did successfully write to:
         */
        bch2_fs_fatal_err_on(!nr_wrote ||
                             (can_mount_without_written &&
                              !can_mount_with_written), c,
                "Unable to write superblock to sufficient devices");
out:
        /* Make new options visible after they're persistent: */
        bch2_sb_update(c);
}
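
/*
 * bch2_write_super() issues writes one superblock copy at a time across all
 * devices, with a closure_sync() between rounds: every device finishes copy
 * N before any device starts copy N + 1.
 */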

/* BCH_SB_FIELD_journal: */

static int u64_cmp(const void *_l, const void *_r)
{
        u64 l = *((const u64 *) _l), r = *((const u64 *) _r);

        return l < r ? -1 : l > r ? 1 : 0;
}

static const char *bch2_sb_validate_journal(struct bch_sb *sb,
                                            struct bch_sb_field *f)
{
        struct bch_sb_field_journal *journal = field_to_type(f, journal);
        struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
        const char *err;
        unsigned nr;
        unsigned i;
        u64 *b;

        journal = bch2_sb_get_journal(sb);
        if (!journal)
                return NULL;

        nr = bch2_nr_journal_buckets(journal);
        if (!nr)
                return NULL;

        b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
        if (!b)
                return "cannot allocate memory";

        for (i = 0; i < nr; i++)
                b[i] = le64_to_cpu(journal->buckets[i]);

        sort(b, nr, sizeof(u64), u64_cmp, NULL);

        err = "journal bucket at sector 0";
        if (!b[0])
                goto err;

        err = "journal bucket before first bucket";
        if (m && b[0] < le16_to_cpu(m->first_bucket))
                goto err;

        err = "journal bucket past end of device";
        if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
                goto err;

        err = "duplicate journal buckets";
        for (i = 0; i + 1 < nr; i++)
                if (b[i] == b[i + 1])
                        goto err;

        err = NULL;
err:
        kfree(b);
        return err;
}

/* BCH_SB_FIELD_members: */

static const char *bch2_sb_validate_members(struct bch_sb *sb,
                                            struct bch_sb_field *f)
{
        struct bch_sb_field_members *mi = field_to_type(f, members);
        struct bch_member *m;

        if ((void *) (mi->members + sb->nr_devices) >
            vstruct_end(&mi->field))
                return "Invalid superblock: bad member info";

        for (m = mi->members;
             m < mi->members + sb->nr_devices;
             m++) {
                if (!bch2_member_exists(m))
                        continue;

                if (le64_to_cpu(m->nbuckets) > LONG_MAX)
                        return "Too many buckets";

                if (le64_to_cpu(m->nbuckets) -
                    le16_to_cpu(m->first_bucket) < 1 << 10)
                        return "Not enough buckets";

                if (le16_to_cpu(m->bucket_size) <
                    le16_to_cpu(sb->block_size))
                        return "bucket size smaller than block size";

                if (le16_to_cpu(m->bucket_size) <
                    BCH_SB_BTREE_NODE_SIZE(sb))
                        return "bucket size smaller than btree node size";
        }

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
                for (m = mi->members;
                     m < mi->members + sb->nr_devices;
                     m++)
                        SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);

        return NULL;
}

/* BCH_SB_FIELD_crypt: */

static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);

        if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
                return "invalid field crypt: wrong size";

        if (BCH_CRYPT_KDF_TYPE(crypt))
                return "invalid field crypt: bad kdf type";

        return NULL;
}

/* BCH_SB_FIELD_replicas: */

/* Replicas tracking - in memory: */

#define for_each_cpu_replicas_entry(_r, _i)                             \
        for (_i = (_r)->entries;                                        \
             (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
             _i = (void *) (_i) + (_r)->entry_size)

static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
        return (void *) r->entries + r->entry_size * i;
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
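
/*
 * Entries are kept in eytzinger (BFS array) order rather than plain sorted
 * order: eytzinger0_sort() arranges them so the eytzinger0_find() lookup in
 * replicas_has_entry() below can binary search with better cache locality
 * than a conventionally sorted array would give.
 */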

static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
                                     unsigned dev)
{
        return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
                                    unsigned dev)
{
        e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
        return (r->entry_size -
                offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
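
/*
 * Device bitmap layout, by example: device 10 lives in devs[10 >> 3] ==
 * devs[1], bit 10 & 7 == 2; replicas_dev_slots() is simply the number of bits
 * in the devs[] array that follows the entry header.
 */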

int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
                              char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_cpu_entry *e;
        bool first = true;
        unsigned i;

        for_each_cpu_replicas_entry(r, e) {
                bool first_e = true;

                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;

                out += scnprintf(out, end - out, "%u: [", e->data_type);

                for (i = 0; i < replicas_dev_slots(r); i++)
                        if (replicas_test_dev(e, i)) {
                                if (!first_e)
                                        out += scnprintf(out, end - out, " ");
                                first_e = false;
                                out += scnprintf(out, end - out, "%u", i);
                        }
                out += scnprintf(out, end - out, "]");
        }

        return out - buf;
}

static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
                                        enum bch_data_type data_type,
                                        struct bch_replicas_cpu_entry *r,
                                        unsigned *max_dev)
{
        const struct bch_extent_ptr *ptr;
        unsigned nr = 0;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        memset(r, 0, sizeof(*r));
        r->data_type = data_type;

        *max_dev = 0;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached) {
                        *max_dev = max_t(unsigned, *max_dev, ptr->dev);
                        replicas_set_dev(r, ptr->dev);
                        nr++;
                }
        return nr;
}

static inline void devlist_to_replicas(struct bch_devs_list devs,
                                       enum bch_data_type data_type,
                                       struct bch_replicas_cpu_entry *r,
                                       unsigned *max_dev)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        memset(r, 0, sizeof(*r));
        r->data_type = data_type;

        *max_dev = 0;

        for (i = 0; i < devs.nr; i++) {
                *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
                replicas_set_dev(r, devs.devs[i]);
        }
}

static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                       struct bch_replicas_cpu_entry new_entry,
                       unsigned max_dev)
{
        struct bch_replicas_cpu *new;
        unsigned i, nr, entry_size;

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);
        entry_size = max(entry_size, old->entry_size);
        nr = old->nr + 1;

        new = kzalloc(sizeof(struct bch_replicas_cpu) +
                      nr * entry_size, GFP_NOIO);
        if (!new)
                return NULL;

        new->nr         = nr;
        new->entry_size = entry_size;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(new, i),
                       cpu_replicas_entry(old, i),
                       min(new->entry_size, old->entry_size));

        memcpy(cpu_replicas_entry(new, old->nr),
               &new_entry,
               new->entry_size);

        bch2_cpu_replicas_sort(new);
        return new;
}

static bool replicas_has_entry(struct bch_replicas_cpu *r,
                                struct bch_replicas_cpu_entry search,
                                unsigned max_dev)
{
        return max_dev < replicas_dev_slots(r) &&
                eytzinger0_find(r->entries, r->nr,
                                r->entry_size,
                                memcmp, &search) < r->nr;
}

noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                struct bch_replicas_cpu_entry new_entry,
                                unsigned max_dev)
{
        struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
        int ret = -ENOMEM;

        mutex_lock(&c->sb_lock);

        old_gc = rcu_dereference_protected(c->replicas_gc,
                                           lockdep_is_held(&c->sb_lock));
        if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
                new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
                if (!new_gc)
                        goto err;
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));
        if (!replicas_has_entry(old_r, new_entry, max_dev)) {
                new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
                if (!new_r)
                        goto err;

                ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
                if (ret)
                        goto err;
        }

        /* allocations done, now commit: */

        if (new_r)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */

        if (new_gc) {
                rcu_assign_pointer(c->replicas_gc, new_gc);
                kfree_rcu(old_gc, rcu);
        }

        if (new_r) {
                rcu_assign_pointer(c->replicas, new_r);
                kfree_rcu(old_r, rcu);
        }

        mutex_unlock(&c->sb_lock);
        return 0;
err:
        mutex_unlock(&c->sb_lock);
        kfree(new_gc);
        kfree(new_r);
        return ret;
}

int bch2_mark_replicas(struct bch_fs *c,
                       enum bch_data_type data_type,
                       struct bch_devs_list devs)
{
        struct bch_replicas_cpu_entry search;
        struct bch_replicas_cpu *r, *gc_r;
        unsigned max_dev;
        bool marked;

        if (!devs.nr)
                return 0;

        BUG_ON(devs.nr >= BCH_REPLICAS_MAX);

        devlist_to_replicas(devs, data_type, &search, &max_dev);

        rcu_read_lock();
        r = rcu_dereference(c->replicas);
        gc_r = rcu_dereference(c->replicas_gc);
        marked = replicas_has_entry(r, search, max_dev) &&
                (likely(!gc_r) || replicas_has_entry(gc_r, search, max_dev));
        rcu_read_unlock();

        return likely(marked) ? 0
                : bch2_mark_replicas_slowpath(c, search, max_dev);
}
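
/*
 * Fast path: under rcu, just check that the (data type, device set) entry is
 * already present in the in-memory tables. Only a genuinely new combination
 * takes the slowpath above, which allocates, updates the superblock section
 * and writes it out before making the new in-memory table visible.
 */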

int bch2_mark_bkey_replicas(struct bch_fs *c,
                            enum bch_data_type data_type,
                            struct bkey_s_c k)
{
        struct bch_devs_list cached = bch2_bkey_cached_devs(k);
        unsigned i;
        int ret;

        for (i = 0; i < cached.nr; i++)
                if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
                                              bch2_dev_list_single(cached.devs[i]))))
                        return ret;

        return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}

int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
        struct bch_replicas_cpu *new_r, *old_r;
        int ret = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);

        new_r = rcu_dereference_protected(c->replicas_gc,
                                          lockdep_is_held(&c->sb_lock));

        if (err) {
                rcu_assign_pointer(c->replicas_gc, NULL);
                kfree_rcu(new_r, rcu);
                goto err;
        }

        if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
                ret = -ENOSPC;
                goto err;
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));

        rcu_assign_pointer(c->replicas, new_r);
        rcu_assign_pointer(c->replicas_gc, NULL);
        kfree_rcu(old_r, rcu);

        bch2_write_super(c);
err:
        mutex_unlock(&c->sb_lock);
        return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_cpu *dst, *src;
        struct bch_replicas_cpu_entry *e;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc);

        src = rcu_dereference_protected(c->replicas,
                                        lockdep_is_held(&c->sb_lock));

        dst = kzalloc(sizeof(struct bch_replicas_cpu) +
                      src->nr * src->entry_size, GFP_NOIO);
        if (!dst) {
                mutex_unlock(&c->sb_lock);
                return -ENOMEM;
        }

        dst->nr         = 0;
        dst->entry_size = src->entry_size;

        for_each_cpu_replicas_entry(src, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(dst, dst->nr++),
                               e, dst->entry_size);

        bch2_cpu_replicas_sort(dst);

        rcu_assign_pointer(c->replicas_gc, dst);
        mutex_unlock(&c->sb_lock);

        return 0;
}
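
/*
 * GC protocol: bch2_replicas_gc_start() seeds c->replicas_gc with only the
 * entries whose data type is *not* being gc'd; while gc runs, new marks go
 * into both tables (see bch2_mark_replicas_slowpath()); bch2_replicas_gc_end()
 * then swaps the rebuilt table in, dropping entries nothing re-marked.
 */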

/* Replicas tracking - superblock: */

static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
                                        unsigned *nr,
                                        unsigned *bytes,
                                        unsigned *max_dev)
{
        struct bch_replicas_entry *i;
        unsigned j;

        *nr     = 0;
        *bytes  = sizeof(*r);
        *max_dev = 0;

        if (!r)
                return;

        for_each_replicas_entry(r, i) {
                for (j = 0; j < i->nr; j++)
                        *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
                (*nr)++;
        }

        *bytes = (void *) i - (void *) r;
}

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
        struct bch_replicas_cpu *cpu_r;
        unsigned i, nr, bytes, max_dev, entry_size;

        bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);

        cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
                        nr * entry_size, GFP_NOIO);
        if (!cpu_r)
                return NULL;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        if (nr) {
                struct bch_replicas_cpu_entry *dst =
                        cpu_replicas_entry(cpu_r, 0);
                struct bch_replicas_entry *src = sb_r->entries;

                while (dst < cpu_replicas_entry(cpu_r, nr)) {
                        dst->data_type = src->data_type;
                        for (i = 0; i < src->nr; i++)
                                replicas_set_dev(dst, src->devs[i]);

                        src     = replicas_entry_next(src);
                        dst     = (void *) dst + entry_size;
                }
        }

        bch2_cpu_replicas_sort(cpu_r);
        return cpu_r;
}
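
/*
 * On disk, replicas entries are variable length (a data type, a count, and
 * that many device indexes); in memory they're fixed size, with the device
 * list blown up into a bitmap sized by the largest device index seen, which
 * is what makes the memcmp()-based sort and search above possible.
 */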

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r, *old_r;

        sb_r    = bch2_sb_get_replicas(c->disk_sb);
        cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                return -ENOMEM;

        old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas, cpu_r);
        if (old_r)
                kfree_rcu(old_r, rcu);

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *sb_e;
        struct bch_replicas_cpu_entry *e;
        size_t i, bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, e) {
                bytes += sizeof(struct bch_replicas_entry);
                for (i = 0; i < r->entry_size - 1; i++)
                        bytes += hweight8(e->devs[i]);
        }

        sb_r = bch2_fs_sb_resize_replicas(c,
                        DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        sb_e = sb_r->entries;
        for_each_cpu_replicas_entry(r, e) {
                sb_e->data_type = e->data_type;

                for (i = 0; i < replicas_dev_slots(r); i++)
                        if (replicas_test_dev(e, i))
                                sb_e->devs[sb_e->nr++] = i;

                sb_e = replicas_entry_next(sb_e);

                BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
        }

        return 0;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb,
                                             struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu *cpu_r = NULL;
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr)
                        goto err;

                err = "invalid replicas entry: too many devices";
                if (e->nr >= BCH_REPLICAS_MAX)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                goto err;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_cpu_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_cpu_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                err = "duplicate replicas entry";
                if (!memcmp(l, r, cpu_r->entry_size))
                        goto err;
        }

        err = NULL;
err:
        kfree(cpu_r);
        return err;
}

int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_entry *e;
        bool first = true;
        unsigned i;

        if (!r) {
                out += scnprintf(out, end - out, "(no replicas section found)");
                return out - buf;
        }

        for_each_replicas_entry(r, e) {
                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;

                out += scnprintf(out, end - out, "%u: [", e->data_type);

                for (i = 0; i < e->nr; i++)
                        out += scnprintf(out, end - out,
                                         i ? " %u" : "%u", e->devs[i]);
                out += scnprintf(out, end - out, "]");
        }

        return out - buf;
}

/* Query replicas: */

bool bch2_replicas_marked(struct bch_fs *c,
                          enum bch_data_type data_type,
                          struct bch_devs_list devs)
{
        struct bch_replicas_cpu_entry search;
        unsigned max_dev;
        bool ret;

        if (!devs.nr)
                return true;

        devlist_to_replicas(devs, data_type, &search, &max_dev);

        rcu_read_lock();
        ret = replicas_has_entry(rcu_dereference(c->replicas),
                                 search, max_dev);
        rcu_read_unlock();

        return ret;
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
                               enum bch_data_type data_type,
                               struct bkey_s_c k)
{
        struct bch_devs_list cached = bch2_bkey_cached_devs(k);
        unsigned i;

        for (i = 0; i < cached.nr; i++)
                if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
                                          bch2_dev_list_single(cached.devs[i])))
                        return false;

        return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}

struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
{
        struct bch_sb_field_members *mi;
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, dev, dev_slots, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].nr_online = UINT_MAX;

        mi = bch2_sb_get_members(c->disk_sb);
        rcu_read_lock();

        r = rcu_dereference(c->replicas);
        dev_slots = replicas_dev_slots(r);

        for_each_cpu_replicas_entry(r, e) {
                if (e->data_type >= ARRAY_SIZE(ret.replicas))
                        panic("e %p data_type %u\n", e, e->data_type);

                nr_online = nr_offline = 0;

                for (dev = 0; dev < dev_slots; dev++) {
                        if (!replicas_test_dev(e, dev))
                                continue;

                        BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));

                        if (test_bit(dev, online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].nr_online =
                        min(ret.replicas[e->data_type].nr_online,
                            nr_online);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        rcu_read_unlock();

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

static bool have_enough_devs(struct replicas_status s,
                             enum bch_data_type type,
                             bool force_if_degraded,
                             bool force_if_lost)
{
        return (!s.replicas[type].nr_offline || force_if_degraded) &&
                (s.replicas[type].nr_online || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
        return (have_enough_devs(s, BCH_DATA_JOURNAL,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_BTREE,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_USER,
                                 flags & BCH_FORCE_IF_DATA_DEGRADED,
                                 flags & BCH_FORCE_IF_DATA_LOST));
}

unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return meta
                ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
                      s.replicas[BCH_DATA_BTREE].nr_online)
                : s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned ret = 0;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);

        if (ca->dev_idx >= replicas_dev_slots(r))
                goto out;

        for_each_cpu_replicas_entry(r, e)
                if (replicas_test_dev(e, ca->dev_idx))
                        ret |= 1 << e->data_type;
out:
        rcu_read_unlock();

        return ret;
}

/* Quotas: */

static const char *bch2_sb_validate_quota(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        struct bch_sb_field_quota *q = field_to_type(f, quota);

        if (vstruct_bytes(&q->field) != sizeof(*q))
                return "invalid field quota: wrong size";

        return NULL;
}

/* Disk groups: */

#if 0
static size_t trim_nulls(const char *str, size_t len)
{
        while (len && !str[len - 1])
                --len;
        return len;
}
#endif

static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
                                                struct bch_sb_field *f)
{
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
        struct bch_sb_field_members *mi;
        struct bch_member *m;
        struct bch_disk_group *g;
        unsigned nr_groups;

        mi              = bch2_sb_get_members(sb);
        groups          = bch2_sb_get_disk_groups(sb);
        nr_groups       = disk_groups_nr(groups);

        for (m = mi->members;
             m < mi->members + sb->nr_devices;
             m++) {
                if (!BCH_MEMBER_GROUP(m))
                        continue;

                if (BCH_MEMBER_GROUP(m) >= nr_groups)
                        return "disk has invalid group";

                g = &groups->entries[BCH_MEMBER_GROUP(m)];
                if (BCH_GROUP_DELETED(g))
                        return "disk has invalid group";
        }
#if 0
        if (!groups)
                return NULL;

        char **labels;
        labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
        if (!labels)
                return "cannot allocate memory";

        for (g = groups->groups;
             g < groups->groups + nr_groups;
             g++) {

        }
#endif
        return NULL;
}

static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
        struct bch_sb_field_members *mi;
        struct bch_sb_field_disk_groups *groups;
        struct bch_disk_groups_cpu *cpu_g, *old_g;
        unsigned i, nr_groups;

        lockdep_assert_held(&c->sb_lock);

        mi              = bch2_sb_get_members(c->disk_sb);
        groups          = bch2_sb_get_disk_groups(c->disk_sb);
        nr_groups       = disk_groups_nr(groups);

        if (!groups)
                return 0;

        cpu_g = kzalloc(sizeof(*cpu_g) +
                        sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
        if (!cpu_g)
                return -ENOMEM;

        cpu_g->nr = nr_groups;

        for (i = 0; i < nr_groups; i++) {
                struct bch_disk_group *src      = &groups->entries[i];
                struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];

                dst->deleted = BCH_GROUP_DELETED(src);
        }

        for (i = 0; i < c->disk_sb->nr_devices; i++) {
                struct bch_member *m = mi->members + i;
                struct bch_disk_group_cpu *dst =
                        &cpu_g->entries[BCH_MEMBER_GROUP(m)];

                if (!bch2_member_exists(m))
                        continue;

                __set_bit(i, dst->devs.d);
        }

        old_g = c->disk_groups;
        rcu_assign_pointer(c->disk_groups, cpu_g);
        if (old_g)
                kfree_rcu(old_g, rcu);

        return 0;
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
        struct target t = target_decode(target);

        switch (t.type) {
        case TARGET_DEV:
                BUG_ON(t.dev >= c->sb.nr_devices || !c->devs[t.dev]);
                return &c->devs[t.dev]->self;
        case TARGET_GROUP: {
                struct bch_disk_groups_cpu *g =
                        rcu_dereference(c->disk_groups);

                /* XXX: what to do here? */
                BUG_ON(t.group >= g->nr || g->entries[t.group].deleted);
                return &g->entries[t.group].devs;
        }
        default:
                BUG();
        }
}