#include "bcachefs.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"

#include <linux/backing-dev.h>
#include <linux/sort.h>

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);

/* superblock fields (optional/variable-size sections): */

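/*
 * Note: BCH_SB_FIELDS() is an x-macro list of (name, nr) pairs defined in
 * the format headers; each expansion below stamps out one entry per
 * optional superblock section.  As an illustration, an x(journal, 0) entry
 * would contribute the string "journal" here and the validator
 * bch2_sb_validate_journal to the ops table below.
 */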
const char * const bch2_sb_fields[] = {
#define x(name, nr)     #name,
        BCH_SB_FIELDS()
#undef x
        NULL
};

#define x(f, nr)                                        \
static const char *bch2_sb_validate_##f(struct bch_sb *, struct bch_sb_field *);
        BCH_SB_FIELDS()
#undef x

struct bch_sb_field_ops {
        const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
};

static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
#define x(f, nr)                                        \
        [BCH_SB_FIELD_##f] = {                          \
                .validate = bch2_sb_validate_##f,       \
        },
        BCH_SB_FIELDS()
#undef x
};

static const char *bch2_sb_field_validate(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        unsigned type = le32_to_cpu(f->type);

        return type < BCH_SB_FIELD_NR
                ? bch2_sb_field_ops[type].validate(sb, f)
                : NULL;
}

struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
                                       enum bch_sb_field_type type)
{
        struct bch_sb_field *f;

        /* XXX: need locking around superblock to access optional fields */

        vstruct_for_each(sb, f)
                if (le32_to_cpu(f->type) == type)
                        return f;
        return NULL;
}

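/*
 * Fields are laid out back to back within the superblock vstruct, so
 * resizing one field means shifting everything after it.  For example,
 * growing a field from 4 to 6 u64s moves the tail of the superblock up by
 * 2 u64s, zeroes the newly exposed gap, and bumps sb->u64s by the same
 * delta.
 */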
static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
                                                   struct bch_sb_field *f,
                                                   unsigned u64s)
{
        unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;

        if (!f) {
                f = vstruct_last(sb);
                memset(f, 0, sizeof(u64) * u64s);
                f->u64s = cpu_to_le32(u64s);
                f->type = 0;
        } else {
                void *src, *dst;

                src = vstruct_end(f);
                f->u64s = cpu_to_le32(u64s);
                dst = vstruct_end(f);

                memmove(dst, src, vstruct_end(sb) - src);

                if (dst > src)
                        memset(src, 0, dst - src);
        }

        le32_add_cpu(&sb->u64s, u64s - old_u64s);

        return f;
}

/* Superblock realloc/free: */

void bch2_free_super(struct bch_sb_handle *sb)
{
        if (sb->bio)
                bio_put(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
                blkdev_put(sb->bdev, sb->mode);

        free_pages((unsigned long) sb->sb, sb->page_order);
        memset(sb, 0, sizeof(*sb));
}

static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
{
        struct bch_sb *new_sb;
        struct bio *bio;

        if (sb->page_order >= order && sb->sb)
                return 0;

        if (dynamic_fault("bcachefs:add:super_realloc"))
                return -ENOMEM;

        bio = bio_kmalloc(GFP_KERNEL, 1 << order);
        if (!bio)
                return -ENOMEM;

        if (sb->bio)
                bio_put(sb->bio);
        sb->bio = bio;

        new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
        if (!new_sb)
                return -ENOMEM;

        if (sb->sb)
                memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);

        free_pages((unsigned long) sb->sb, sb->page_order);
        sb->sb = new_sb;

        sb->page_order = order;

        return 0;
}

static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
{
        u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
        u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;

        if (new_bytes > max_bytes) {
                char buf[BDEVNAME_SIZE];

                pr_err("%s: superblock too big: want %llu but have %llu",
                       bdevname(sb->bdev, buf), new_bytes, max_bytes);
                return -ENOSPC;
        }

        return __bch2_super_realloc(sb, get_order(new_bytes));
}

static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
{
        u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
        struct bch_sb *sb;
        unsigned order = get_order(bytes);

        if (c->disk_sb && order <= c->disk_sb_order)
                return 0;

        sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
        if (!sb)
                return -ENOMEM;

        if (c->disk_sb)
                memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);

        free_pages((unsigned long) c->disk_sb, c->disk_sb_order);

        c->disk_sb = sb;
        c->disk_sb_order = order;
        return 0;
}

struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                                          enum bch_sb_field_type type,
                                          unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;

        if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
                return NULL;

        f = __bch2_sb_field_resize(sb->sb, f, u64s);
        f->type = cpu_to_le32(type);
        return f;
}

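/*
 * The filesystem-wide variant below must also resize the superblock of
 * every online member, since each device carries a full copy.  A sketch of
 * a typical caller (illustrative, not a real call site), with sb_lock held:
 *
 *      f = bch2_fs_sb_field_resize(c, BCH_SB_FIELD_replicas, u64s);
 *      if (!f)
 *              return -ENOSPC;
 */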
struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
                                             enum bch_sb_field_type type,
                                             unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
                return NULL;

        /* XXX: we're not checking that offline devices have enough space */

        for_each_online_member(ca, c, i) {
                struct bch_sb_handle *sb = &ca->disk_sb;

                if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
                        percpu_ref_put(&ca->ref);
                        return NULL;
                }
        }

        f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
        f->type = cpu_to_le32(type);
        return f;
}

/* Superblock validate: */

static inline void __bch2_sb_layout_size_assert(void)
{
        BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}

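/*
 * The layout lives at a fixed sector (BCH_SB_LAYOUT_SECTOR) and records
 * where every superblock copy is.  Validation below checks the magic, that
 * the number of copies is between 1 and ARRAY_SIZE(layout->sb_offset), and
 * that consecutive copies are at least 1 << sb_max_size_bits sectors apart
 * so they can't overlap even when grown to their maximum size.
 */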
static const char *validate_sb_layout(struct bch_sb_layout *layout)
{
        u64 offset, prev_offset, max_sectors;
        unsigned i;

        if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock layout";

        if (layout->layout_type != 0)
                return "Invalid superblock layout type";

        if (!layout->nr_superblocks)
                return "Invalid superblock layout: no superblocks";

        if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
                return "Invalid superblock layout: too many superblocks";

        max_sectors = 1 << layout->sb_max_size_bits;

        prev_offset = le64_to_cpu(layout->sb_offset[0]);

        for (i = 1; i < layout->nr_superblocks; i++) {
                offset = le64_to_cpu(layout->sb_offset[i]);

                if (offset < prev_offset + max_sectors)
                        return "Invalid superblock layout: superblocks overlap";
                prev_offset = offset;
        }

        return NULL;
}

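/*
 * Validation order matters here: scalar options and the layout are checked
 * first, then every optional field is bounds checked, then the members
 * section is validated before any other field, since the journal, replicas
 * and disk groups validators all index into it.
 */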
const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
{
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *mi;
        const char *err;
        u16 block_size;

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
                SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
                SET_BCH_SB_POSIX_ACL(sb, 1);
        }

        block_size = le16_to_cpu(sb->block_size);

        if (!is_power_of_2(block_size) ||
            block_size > PAGE_SECTORS)
                return "Bad block size";

        if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
                return "Bad user UUID";

        if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
                return "Bad internal UUID";

        if (!sb->nr_devices ||
            sb->nr_devices <= sb->dev_idx ||
            sb->nr_devices > BCH_SB_MEMBERS_MAX)
                return "Bad number of member devices";

        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
            BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_META_REPLICAS_REQ(sb) ||
            BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
            BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
            BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
                return "Invalid metadata checksum type";

        if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
                return "Invalid data checksum type";

        if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
                return "Invalid compression type";

        if (!BCH_SB_BTREE_NODE_SIZE(sb))
                return "Btree node size not set";

        if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
                return "Btree node size not a power of two";

        if (BCH_SB_GC_RESERVE(sb) < 5)
                return "gc reserve percentage too small";

        if (!sb->time_precision ||
            le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
                return "invalid time precision";

        /* validate layout */
        err = validate_sb_layout(&sb->layout);
        if (err)
                return err;

        vstruct_for_each(sb, f) {
                if (!f->u64s)
                        return "Invalid superblock: invalid optional field";

                if (vstruct_next(f) > vstruct_last(sb))
                        return "Invalid superblock: invalid optional field";
        }

        /* members must be validated first: */
        mi = bch2_sb_get_members(sb);
        if (!mi)
                return "Invalid superblock: member info area missing";

        err = bch2_sb_field_validate(sb, &mi->field);
        if (err)
                return err;

        vstruct_for_each(sb, f) {
                if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
                        continue;

                err = bch2_sb_field_validate(sb, f);
                if (err)
                        return err;
        }

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
            bch2_sb_get_crypt(sb) &&
            BCH_SB_INITIALIZED(sb))
                return "Incompatible extent nonces";

        sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);

        return NULL;
}

/* device open: */

static void bch2_sb_update(struct bch_fs *c)
{
        struct bch_sb *src = c->disk_sb;
        struct bch_sb_field_members *mi = bch2_sb_get_members(src);
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        c->sb.uuid              = src->uuid;
        c->sb.user_uuid         = src->user_uuid;
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
        c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
        c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
        c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
        c->sb.time_precision    = le32_to_cpu(src->time_precision);

        for_each_member_device(ca, c, i)
                ca->mi = bch2_mi_to_cpu(mi->members + i);
}

/* copies all fields except the journal field, which is per-device: */
static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
{
        struct bch_sb_field *src_f, *dst_f;

        dst->version            = src->version;
        dst->seq                = src->seq;
        dst->uuid               = src->uuid;
        dst->user_uuid          = src->user_uuid;
        memcpy(dst->label,      src->label, sizeof(dst->label));

        dst->block_size         = src->block_size;
        dst->nr_devices         = src->nr_devices;

        dst->time_base_lo       = src->time_base_lo;
        dst->time_base_hi       = src->time_base_hi;
        dst->time_precision     = src->time_precision;

        memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
        memcpy(dst->features,   src->features,  sizeof(dst->features));
        memcpy(dst->compat,     src->compat,    sizeof(dst->compat));

        vstruct_for_each(src, src_f) {
                if (src_f->type == BCH_SB_FIELD_journal)
                        continue;

                dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
                dst_f = __bch2_sb_field_resize(dst, dst_f,
                                le32_to_cpu(src_f->u64s));

                memcpy(dst_f, src_f, vstruct_bytes(src_f));
        }
}

int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(src);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        int ret;

        lockdep_assert_held(&c->sb_lock);

        ret = bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s);
        if (ret)
                return ret;

        __copy_super(c->disk_sb, src);

        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;

        ret = bch2_sb_disk_groups_to_cpu(c);
        if (ret)
                return ret;

        bch2_sb_update(c);
        return 0;
}

int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(dst);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
        int ret;

        ret = bch2_sb_realloc(&ca->disk_sb, u64s);
        if (ret)
                return ret;

        __copy_super(dst, src);
        return 0;
}

/* read superblock: */

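/*
 * The superblock is variable length, so this is a read-retry loop: read
 * into the current buffer, and if vstruct_bytes() says the superblock is
 * bigger than the buffer, reallocate to the required order and reread from
 * the top.
 */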
static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
{
        struct bch_csum csum;
        size_t bytes;
        unsigned order;
reread:
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
        sb->bio->bi_iter.bi_sector = offset;
        sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(sb->bio, sb->sb);

        if (submit_bio_wait(sb->bio))
                return "IO error";

        if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock";

        if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        bytes = vstruct_bytes(sb->sb);

        if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
                return "Bad superblock: too big";

        order = get_order(bytes);
        if (order > sb->page_order) {
                if (__bch2_super_realloc(sb, order))
                        return "cannot allocate memory";
                goto reread;
        }

        if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
                return "unknown csum type";

        /* XXX: verify MACs */
        csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
                            null_nonce(), sb->sb);

        if (bch2_crc_cmp(csum, sb->sb->csum))
                return "bad checksum reading superblock";

        return NULL;
}

int bch2_read_super(const char *path, struct bch_opts *opts,
                    struct bch_sb_handle *sb)
{
        u64 offset = opt_get(*opts, sb);
        struct bch_sb_layout layout;
        const char *err;
        __le64 *i;
        int ret;

        memset(sb, 0, sizeof(*sb));
        sb->mode = FMODE_READ;

        if (!opt_get(*opts, noexcl))
                sb->mode |= FMODE_EXCL;

        if (!opt_get(*opts, nochanges))
                sb->mode |= FMODE_WRITE;

        sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
        if (IS_ERR(sb->bdev) &&
            PTR_ERR(sb->bdev) == -EACCES &&
            opt_get(*opts, read_only)) {
                sb->mode &= ~FMODE_WRITE;

                sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
                if (!IS_ERR(sb->bdev))
                        opt_set(*opts, nochanges, true);
        }

        if (IS_ERR(sb->bdev))
                return PTR_ERR(sb->bdev);

        err = "cannot allocate memory";
        ret = __bch2_super_realloc(sb, 0);
        if (ret)
                goto err;

        ret = -EFAULT;
        err = "dynamic fault";
        if (bch2_fs_init_fault("read_super"))
                goto err;

        ret = -EINVAL;
        err = read_one_super(sb, offset);
        if (!err)
                goto got_super;

        if (opt_defined(*opts, sb))
                goto err;

        pr_err("error reading default superblock: %s", err);

        /*
         * Error reading primary superblock - read location of backup
         * superblocks:
         */
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
        sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
        sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        /*
         * use sb buffer to read layout, since sb buffer is page aligned but
         * layout won't be:
         */
        bch2_bio_map(sb->bio, sb->sb);

        err = "IO error";
        if (submit_bio_wait(sb->bio))
                goto err;

        memcpy(&layout, sb->sb, sizeof(layout));
        err = validate_sb_layout(&layout);
        if (err)
                goto err;

        for (i = layout.sb_offset;
             i < layout.sb_offset + layout.nr_superblocks; i++) {
                offset = le64_to_cpu(*i);

                if (offset == opt_get(*opts, sb))
                        continue;

                err = read_one_super(sb, offset);
                if (!err)
                        goto got_super;
        }

        ret = -EINVAL;
        goto err;

got_super:
        err = "Superblock block size smaller than device block size";
        ret = -EINVAL;
        if (le16_to_cpu(sb->sb->block_size) << 9 <
            bdev_logical_block_size(sb->bdev))
                goto err;

        if (sb->mode & FMODE_WRITE)
                bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
                        |= BDI_CAP_STABLE_WRITES;

        return 0;
err:
        bch2_free_super(sb);
        pr_err("error reading superblock: %s", err);
        return ret;
}

/* write superblock: */

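/*
 * Write protocol: bump the sequence number once, re-copy the filesystem
 * superblock out to every online member and validate each copy, then write
 * superblock copy 0 to all devices, wait, write copy 1 to all devices, and
 * so on until every device's layout is exhausted.  bch2_write_super() then
 * checks that the devices that acked the write are still enough to mount
 * from.
 */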
static void write_super_endio(struct bio *bio)
{
        struct bch_dev *ca = bio->bi_private;

        /* XXX: return errors directly */

        if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
                ca->sb_write_error = 1;

        closure_put(&ca->fs->sb_write);
        percpu_ref_put(&ca->io_ref);
}

static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
        struct bch_sb *sb = ca->disk_sb.sb;
        struct bio *bio = ca->disk_sb.bio;

        sb->offset = sb->layout.sb_offset[idx];

        SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
        sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
                                null_nonce(), sb);

        bio_reset(bio);
        bio_set_dev(bio, ca->disk_sb.bdev);
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
        bio->bi_iter.bi_size    =
                roundup(vstruct_bytes(sb),
                        bdev_logical_block_size(ca->disk_sb.bdev));
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
        bch2_bio_map(bio, sb);

        this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
                     bio_sectors(bio));

        percpu_ref_get(&ca->io_ref);
        closure_bio_submit(bio, &c->sb_write);
}

void bch2_write_super(struct bch_fs *c)
{
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
        unsigned i, sb = 0, nr_wrote;
        const char *err;
        struct bch_devs_mask sb_written;
        bool wrote, can_mount_without_written, can_mount_with_written;

        lockdep_assert_held(&c->sb_lock);

        closure_init_stack(cl);
        memset(&sb_written, 0, sizeof(sb_written));

        le64_add_cpu(&c->disk_sb->seq, 1);

        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);

        for_each_online_member(ca, c, i) {
                err = bch2_sb_validate(&ca->disk_sb);
                if (err) {
                        bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
                        goto out;
                }
        }

        if (c->opts.nochanges ||
            test_bit(BCH_FS_ERROR, &c->flags))
                goto out;

        for_each_online_member(ca, c, i) {
                __set_bit(ca->dev_idx, sb_written.d);
                ca->sb_write_error = 0;
        }

        do {
                wrote = false;
                for_each_online_member(ca, c, i)
                        if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
                                write_one_super(c, ca, sb);
                                wrote = true;
                        }
                closure_sync(cl);
                sb++;
        } while (wrote);

        for_each_online_member(ca, c, i)
                if (ca->sb_write_error)
                        __clear_bit(ca->dev_idx, sb_written.d);

        nr_wrote = dev_mask_nr(&sb_written);

        can_mount_with_written =
                bch2_have_enough_devs(c,
                        __bch2_replicas_status(c, sb_written),
                        BCH_FORCE_IF_DEGRADED);

        for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
                sb_written.d[i] = ~sb_written.d[i];

        can_mount_without_written =
                bch2_have_enough_devs(c,
                        __bch2_replicas_status(c, sb_written),
                        BCH_FORCE_IF_DEGRADED);

        /*
         * If we would be able to mount _without_ the devices we successfully
         * wrote superblocks to, we weren't able to write to enough devices:
         *
         * Exception: if we can mount without the successes because we haven't
         * written anything (new filesystem), we continue if we'd be able to
         * mount with the devices we did successfully write to:
         */
        bch2_fs_fatal_err_on(!nr_wrote ||
                             (can_mount_without_written &&
                              !can_mount_with_written), c,
                "Unable to write superblock to sufficient devices");
out:
        /* Make new options visible after they're persistent: */
        bch2_sb_update(c);
}

/* BCH_SB_FIELD_journal: */

static int u64_cmp(const void *_l, const void *_r)
{
        u64 l = *((const u64 *) _l), r = *((const u64 *) _r);

        return l < r ? -1 : l > r ? 1 : 0;
}

static const char *bch2_sb_validate_journal(struct bch_sb *sb,
                                            struct bch_sb_field *f)
{
        struct bch_sb_field_journal *journal = field_to_type(f, journal);
        struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
        const char *err;
        unsigned nr;
        unsigned i;
        u64 *b;

        journal = bch2_sb_get_journal(sb);
        if (!journal)
                return NULL;

        nr = bch2_nr_journal_buckets(journal);
        if (!nr)
                return NULL;

        b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
        if (!b)
                return "cannot allocate memory";

        for (i = 0; i < nr; i++)
                b[i] = le64_to_cpu(journal->buckets[i]);

        sort(b, nr, sizeof(u64), u64_cmp, NULL);

        err = "journal bucket at sector 0";
        if (!b[0])
                goto err;

        err = "journal bucket before first bucket";
        if (m && b[0] < le16_to_cpu(m->first_bucket))
                goto err;

        err = "journal bucket past end of device";
        if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
                goto err;

        err = "duplicate journal buckets";
        for (i = 0; i + 1 < nr; i++)
                if (b[i] == b[i + 1])
                        goto err;

        err = NULL;
err:
        kfree(b);
        return err;
}

/* BCH_SB_FIELD_members: */

static const char *bch2_sb_validate_members(struct bch_sb *sb,
                                            struct bch_sb_field *f)
{
        struct bch_sb_field_members *mi = field_to_type(f, members);
        struct bch_member *m;

        if ((void *) (mi->members + sb->nr_devices) >
            vstruct_end(&mi->field))
                return "Invalid superblock: bad member info";

        for (m = mi->members;
             m < mi->members + sb->nr_devices;
             m++) {
                if (!bch2_member_exists(m))
                        continue;

                if (le64_to_cpu(m->nbuckets) > LONG_MAX)
                        return "Too many buckets";

                if (le64_to_cpu(m->nbuckets) -
                    le16_to_cpu(m->first_bucket) < 1 << 10)
                        return "Not enough buckets";

                if (le16_to_cpu(m->bucket_size) <
                    le16_to_cpu(sb->block_size))
                        return "bucket size smaller than block size";

                if (le16_to_cpu(m->bucket_size) <
                    BCH_SB_BTREE_NODE_SIZE(sb))
                        return "bucket size smaller than btree node size";
        }

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
                for (m = mi->members;
                     m < mi->members + sb->nr_devices;
                     m++)
                        SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);

        return NULL;
}

/* BCH_SB_FIELD_crypt: */

static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);

        if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
                return "invalid field crypt: wrong size";

        if (BCH_CRYPT_KDF_TYPE(crypt))
                return "invalid field crypt: bad kdf type";

        return NULL;
}

/* BCH_SB_FIELD_replicas: */

/* Replicas tracking - in memory: */

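/*
 * The in-memory representation is a flat array of fixed-size entries: a
 * data type byte followed by a bitmap of device indices (see
 * replicas_test_dev()/replicas_set_dev() below).  entry_size is uniform
 * across the array, so entries compare with plain memcmp - which is what
 * lets the array be kept sorted and searched in eytzinger order.
 */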
#define for_each_cpu_replicas_entry(_r, _i)                             \
        for (_i = (_r)->entries;                                        \
             (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
             _i = (void *) (_i) + (_r)->entry_size)

static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
        return (void *) r->entries + r->entry_size * i;
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
                                     unsigned dev)
{
        return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
                                    unsigned dev)
{
        e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
        return (r->entry_size -
                offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}

int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
                              char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_cpu_entry *e;
        bool first = true;
        unsigned i;

        for_each_cpu_replicas_entry(r, e) {
                bool first_e = true;

                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;

                out += scnprintf(out, end - out, "%u: [", e->data_type);

                for (i = 0; i < replicas_dev_slots(r); i++)
                        if (replicas_test_dev(e, i)) {
                                if (!first_e)
                                        out += scnprintf(out, end - out, " ");
                                first_e = false;
                                out += scnprintf(out, end - out, "%u", i);
                        }
                out += scnprintf(out, end - out, "]");
        }

        return out - buf;
}

static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
                                        enum bch_data_type data_type,
                                        struct bch_replicas_cpu_entry *r,
                                        unsigned *max_dev)
{
        const struct bch_extent_ptr *ptr;
        unsigned nr = 0;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        memset(r, 0, sizeof(*r));
        r->data_type = data_type;

        *max_dev = 0;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached) {
                        *max_dev = max_t(unsigned, *max_dev, ptr->dev);
                        replicas_set_dev(r, ptr->dev);
                        nr++;
                }
        return nr;
}

static inline void devlist_to_replicas(struct bch_devs_list devs,
                                       enum bch_data_type data_type,
                                       struct bch_replicas_cpu_entry *r,
                                       unsigned *max_dev)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        memset(r, 0, sizeof(*r));
        r->data_type = data_type;

        *max_dev = 0;

        for (i = 0; i < devs.nr; i++) {
                *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
                replicas_set_dev(r, devs.devs[i]);
        }
}

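/*
 * Adding an entry may require a wider device bitmap than the current array
 * uses, so the array is rebuilt: entry_size becomes the larger of the old
 * size and the size needed for max_dev, old entries are copied across
 * (implicitly zero-padded by kzalloc), the new entry is appended, and the
 * whole array is re-sorted.
 */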
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                       struct bch_replicas_cpu_entry new_entry,
                       unsigned max_dev)
{
        struct bch_replicas_cpu *new;
        unsigned i, nr, entry_size;

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);
        entry_size = max(entry_size, old->entry_size);
        nr = old->nr + 1;

        new = kzalloc(sizeof(struct bch_replicas_cpu) +
                      nr * entry_size, GFP_NOIO);
        if (!new)
                return NULL;

        new->nr         = nr;
        new->entry_size = entry_size;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(new, i),
                       cpu_replicas_entry(old, i),
                       min(new->entry_size, old->entry_size));

        memcpy(cpu_replicas_entry(new, old->nr),
               &new_entry,
               new->entry_size);

        bch2_cpu_replicas_sort(new);
        return new;
}

static bool replicas_has_entry(struct bch_replicas_cpu *r,
                                struct bch_replicas_cpu_entry search,
                                unsigned max_dev)
{
        return max_dev < replicas_dev_slots(r) &&
                eytzinger0_find(r->entries, r->nr,
                                r->entry_size,
                                memcmp, &search) < r->nr;
}

noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
                                struct bch_replicas_cpu_entry new_entry,
                                unsigned max_dev)
{
        struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
        int ret = -ENOMEM;

        mutex_lock(&c->sb_lock);

        old_gc = rcu_dereference_protected(c->replicas_gc,
                                           lockdep_is_held(&c->sb_lock));
        if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
                new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
                if (!new_gc)
                        goto err;
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));
        if (!replicas_has_entry(old_r, new_entry, max_dev)) {
                new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
                if (!new_r)
                        goto err;

                ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
                if (ret)
                        goto err;
        }

        /* allocations done, now commit: */

        if (new_r)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */

        if (new_gc) {
                rcu_assign_pointer(c->replicas_gc, new_gc);
                kfree_rcu(old_gc, rcu);
        }

        if (new_r) {
                rcu_assign_pointer(c->replicas, new_r);
                kfree_rcu(old_r, rcu);
        }

        mutex_unlock(&c->sb_lock);
        return 0;
err:
        mutex_unlock(&c->sb_lock);
        if (new_gc)
                kfree(new_gc);
        if (new_r)
                kfree(new_r);
        return ret;
}

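/*
 * Fast path/slow path split: the common case is a lockless RCU lookup;
 * only when an entry is missing do we take the slowpath above, which grabs
 * sb_lock, rebuilds the table(s) and persists the new superblock before
 * publishing the new in-memory table - so we never claim replication that
 * hasn't been written out.
 */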
int bch2_check_mark_super(struct bch_fs *c,
                          enum bch_data_type data_type,
                          struct bch_devs_list devs)
{
        struct bch_replicas_cpu_entry search;
        struct bch_replicas_cpu *r, *gc_r;
        unsigned max_dev;
        bool marked;

        if (!devs.nr)
                return 0;

        devlist_to_replicas(devs, data_type, &search, &max_dev);

        rcu_read_lock();
        r = rcu_dereference(c->replicas);
        gc_r = rcu_dereference(c->replicas_gc);
        marked = replicas_has_entry(r, search, max_dev) &&
                (likely(!gc_r) || replicas_has_entry(gc_r, search, max_dev));
        rcu_read_unlock();

        return likely(marked) ? 0
                : bch2_check_mark_super_slowpath(c, search, max_dev);
}

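/*
 * Replicas gc protocol, as a sketch (illustrative call pattern, with
 * replicas_gc_lock held by the caller):
 *
 *      bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
 *      ... walk and re-mark live data, which repopulates the gc table ...
 *      bch2_replicas_gc_end(c, ret);
 *
 * gc_start() snapshots the current table minus the types being gc'd into
 * c->replicas_gc; bch2_check_mark_super() keeps both tables current while
 * gc runs; gc_end() then makes the gc table authoritative, dropping
 * entries nothing re-marked.
 */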
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
        struct bch_replicas_cpu *new_r, *old_r;
        int ret = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);

        new_r = rcu_dereference_protected(c->replicas_gc,
                                          lockdep_is_held(&c->sb_lock));

        if (err) {
                rcu_assign_pointer(c->replicas_gc, NULL);
                kfree_rcu(new_r, rcu);
                goto err;
        }

        if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
                ret = -ENOSPC;
                goto err;
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));

        rcu_assign_pointer(c->replicas, new_r);
        rcu_assign_pointer(c->replicas_gc, NULL);
        kfree_rcu(old_r, rcu);

        bch2_write_super(c);
err:
        mutex_unlock(&c->sb_lock);
        return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_cpu *dst, *src;
        struct bch_replicas_cpu_entry *e;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc);

        src = rcu_dereference_protected(c->replicas,
                                        lockdep_is_held(&c->sb_lock));

        dst = kzalloc(sizeof(struct bch_replicas_cpu) +
                      src->nr * src->entry_size, GFP_NOIO);
        if (!dst) {
                mutex_unlock(&c->sb_lock);
                return -ENOMEM;
        }

        dst->nr         = 0;
        dst->entry_size = src->entry_size;

        for_each_cpu_replicas_entry(src, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(dst, dst->nr++),
                               e, dst->entry_size);

        bch2_cpu_replicas_sort(dst);

        rcu_assign_pointer(c->replicas_gc, dst);
        mutex_unlock(&c->sb_lock);

        return 0;
}

/* Replicas tracking - superblock: */

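/*
 * On disk, each replicas entry is variable length: a data type, a device
 * count, then that many device indices.  The cpu representation is fixed
 * size, so the helpers below convert in both directions;
 * bch2_sb_replicas_nr_entries() does a sizing pass over the on-disk list
 * first so the cpu array can be allocated in one go.
 */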
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
                                        unsigned *nr,
                                        unsigned *bytes,
                                        unsigned *max_dev)
{
        struct bch_replicas_entry *i;
        unsigned j;

        *nr     = 0;
        *bytes  = sizeof(*r);
        *max_dev = 0;

        if (!r)
                return;

        for_each_replicas_entry(r, i) {
                for (j = 0; j < i->nr; j++)
                        *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
                (*nr)++;
        }

        *bytes = (void *) i - (void *) r;
}

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
        struct bch_replicas_cpu *cpu_r;
        unsigned i, nr, bytes, max_dev, entry_size;

        bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);

        cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
                        nr * entry_size, GFP_NOIO);
        if (!cpu_r)
                return NULL;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        if (nr) {
                struct bch_replicas_cpu_entry *dst =
                        cpu_replicas_entry(cpu_r, 0);
                struct bch_replicas_entry *src = sb_r->entries;

                while (dst < cpu_replicas_entry(cpu_r, nr)) {
                        dst->data_type = src->data_type;
                        for (i = 0; i < src->nr; i++)
                                replicas_set_dev(dst, src->devs[i]);

                        src     = replicas_entry_next(src);
                        dst     = (void *) dst + entry_size;
                }
        }

        bch2_cpu_replicas_sort(cpu_r);
        return cpu_r;
}

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r, *old_r;

        sb_r    = bch2_sb_get_replicas(c->disk_sb);
        cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                return -ENOMEM;

        old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas, cpu_r);
        if (old_r)
                kfree_rcu(old_r, rcu);

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *sb_e;
        struct bch_replicas_cpu_entry *e;
        size_t i, bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, e) {
                bytes += sizeof(struct bch_replicas_entry);
                for (i = 0; i < r->entry_size - 1; i++)
                        bytes += hweight8(e->devs[i]);
        }

        sb_r = bch2_fs_sb_resize_replicas(c,
                        DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        sb_e = sb_r->entries;
        for_each_cpu_replicas_entry(r, e) {
                sb_e->data_type = e->data_type;

                for (i = 0; i < replicas_dev_slots(r); i++)
                        if (replicas_test_dev(e, i))
                                sb_e->devs[sb_e->nr++] = i;

                sb_e = replicas_entry_next(sb_e);

                BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
        }

        return 0;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb,
                                             struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu *cpu_r = NULL;
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr)
                        goto err;

                err = "invalid replicas entry: too many devices";
                if (e->nr >= BCH_REPLICAS_MAX)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                goto err;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_cpu_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_cpu_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                err = "duplicate replicas entry";
                if (!memcmp(l, r, cpu_r->entry_size))
                        goto err;
        }

        err = NULL;
err:
        kfree(cpu_r);
        return err;
}

int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_entry *e;
        bool first = true;
        unsigned i;

        if (!r) {
                out += scnprintf(out, end - out, "(no replicas section found)");
                return out - buf;
        }

        for_each_replicas_entry(r, e) {
                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;

                out += scnprintf(out, end - out, "%u: [", e->data_type);

                for (i = 0; i < e->nr; i++)
                        out += scnprintf(out, end - out,
                                         i ? " %u" : "%u", e->devs[i]);
                out += scnprintf(out, end - out, "]");
        }

        return out - buf;
}

/* Query replicas: */

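/*
 * replicas_status reports, per data type, the worst case over all replicas
 * entries: the minimum number of devices still online and the maximum
 * number offline for any single entry of that type.
 * bch2_have_enough_devs() turns that into a mount/continue decision,
 * relaxed by the BCH_FORCE_IF_* degraded flags.
 */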
bool bch2_sb_has_replicas(struct bch_fs *c,
                          enum bch_data_type data_type,
                          struct bch_devs_list devs)
{
        struct bch_replicas_cpu_entry search;
        unsigned max_dev;
        bool ret;

        if (!devs.nr)
                return true;

        devlist_to_replicas(devs, data_type, &search, &max_dev);

        rcu_read_lock();
        ret = replicas_has_entry(rcu_dereference(c->replicas),
                                 search, max_dev);
        rcu_read_unlock();

        return ret;
}

struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
{
        struct bch_sb_field_members *mi;
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, dev, dev_slots, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].nr_online = UINT_MAX;

        mi = bch2_sb_get_members(c->disk_sb);
        rcu_read_lock();

        r = rcu_dereference(c->replicas);
        dev_slots = replicas_dev_slots(r);

        for_each_cpu_replicas_entry(r, e) {
                if (e->data_type >= ARRAY_SIZE(ret.replicas))
                        panic("e %p data_type %u\n", e, e->data_type);

                nr_online = nr_offline = 0;

                for (dev = 0; dev < dev_slots; dev++) {
                        if (!replicas_test_dev(e, dev))
                                continue;

                        BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));

                        if (test_bit(dev, online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].nr_online =
                        min(ret.replicas[e->data_type].nr_online,
                            nr_online);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        rcu_read_unlock();

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

bool bch2_have_enough_devs(struct bch_fs *c,
                           struct replicas_status s,
                           unsigned flags)
{
        if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
             s.replicas[BCH_DATA_BTREE].nr_offline) &&
            !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
                return false;

        if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
             !s.replicas[BCH_DATA_BTREE].nr_online) &&
            !(flags & BCH_FORCE_IF_METADATA_LOST))
                return false;

        if (s.replicas[BCH_DATA_USER].nr_offline &&
            !(flags & BCH_FORCE_IF_DATA_DEGRADED))
                return false;

        if (!s.replicas[BCH_DATA_USER].nr_online &&
            !(flags & BCH_FORCE_IF_DATA_LOST))
                return false;

        return true;
}

unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return meta
                ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
                      s.replicas[BCH_DATA_BTREE].nr_online)
                : s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned ret = 0;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);

        if (ca->dev_idx >= replicas_dev_slots(r))
                goto out;

        for_each_cpu_replicas_entry(r, e)
                if (replicas_test_dev(e, ca->dev_idx))
                        ret |= 1 << e->data_type;
out:
        rcu_read_unlock();

        return ret;
}

/* Quotas: */

static const char *bch2_sb_validate_quota(struct bch_sb *sb,
                                          struct bch_sb_field *f)
{
        struct bch_sb_field_quota *q = field_to_type(f, quota);

        if (vstruct_bytes(&q->field) != sizeof(*q))
                return "invalid field quota: wrong size";

        return NULL;
}

/* Disk groups: */

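/*
 * A target is a packed reference to either a single device or a disk
 * group.  Groups are stored in their own superblock section and mirrored
 * into a cpu-side structure (bch2_sb_disk_groups_to_cpu() below) mapping
 * each group to the mask of member devices in it, which is what
 * bch2_target_to_mask() hands back to allocation code.
 */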
#if 0
static size_t trim_nulls(const char *str, size_t len)
{
        while (len && !str[len - 1])
                --len;
        return len;
}
#endif

static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
                                                struct bch_sb_field *f)
{
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
        struct bch_sb_field_members *mi;
        struct bch_member *m;
        struct bch_disk_group *g;
        unsigned nr_groups;

        mi              = bch2_sb_get_members(sb);
        groups          = bch2_sb_get_disk_groups(sb);
        nr_groups       = disk_groups_nr(groups);

        for (m = mi->members;
             m < mi->members + sb->nr_devices;
             m++) {
                if (!BCH_MEMBER_GROUP(m))
                        continue;

                if (BCH_MEMBER_GROUP(m) >= nr_groups)
                        return "disk has invalid group";

                g = &groups->entries[BCH_MEMBER_GROUP(m)];
                if (BCH_GROUP_DELETED(g))
                        return "disk has invalid group";
        }
#if 0
        if (!groups)
                return NULL;

        char **labels;
        labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
        if (!labels)
                return "cannot allocate memory";

        for (g = groups->groups;
             g < groups->groups + nr_groups;
             g++) {

        }
#endif
        return NULL;
}

static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
        struct bch_sb_field_members *mi;
        struct bch_sb_field_disk_groups *groups;
        struct bch_disk_groups_cpu *cpu_g, *old_g;
        unsigned i, nr_groups;

        lockdep_assert_held(&c->sb_lock);

        mi              = bch2_sb_get_members(c->disk_sb);
        groups          = bch2_sb_get_disk_groups(c->disk_sb);
        nr_groups       = disk_groups_nr(groups);

        if (!groups)
                return 0;

        cpu_g = kzalloc(sizeof(*cpu_g) +
                        sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
        if (!cpu_g)
                return -ENOMEM;

        cpu_g->nr = nr_groups;

        for (i = 0; i < nr_groups; i++) {
                struct bch_disk_group *src      = &groups->entries[i];
                struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];

                dst->deleted = BCH_GROUP_DELETED(src);
        }

        for (i = 0; i < c->disk_sb->nr_devices; i++) {
                struct bch_member *m = mi->members + i;
                struct bch_disk_group_cpu *dst =
                        &cpu_g->entries[BCH_MEMBER_GROUP(m)];

                if (!bch2_member_exists(m))
                        continue;

                __set_bit(i, dst->devs.d);
        }

        old_g = c->disk_groups;
        rcu_assign_pointer(c->disk_groups, cpu_g);
        if (old_g)
                kfree_rcu(old_g, rcu);

        return 0;
}

const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
        struct target t = target_decode(target);

        switch (t.type) {
        case TARGET_DEV:
                BUG_ON(t.dev >= c->sb.nr_devices || !c->devs[t.dev]);
                return &c->devs[t.dev]->self;
        case TARGET_GROUP: {
                struct bch_disk_groups_cpu *g =
                        rcu_dereference(c->disk_groups);

                /* XXX: what to do here? */
                BUG_ON(t.group >= g->nr || g->entries[t.group].deleted);
                return &g->entries[t.group].devs;
        }
        default:
                BUG();
        }
}