#include <linux/backing-dev.h>
#include <linux/sort.h>

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);

static inline void __bch2_sb_layout_size_assert(void)
{
	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}
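/*
 * The superblock is stored as a vstruct: a fixed header followed by a series
 * of variable-size optional fields, each tagged with its type and its size in
 * u64s. Field lookup is a linear scan over those fields.
 */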
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
				       enum bch_sb_field_type type)
{
	struct bch_sb_field *f;

	/* XXX: need locking around superblock to access optional fields */

	vstruct_for_each(sb, f)
		if (le32_to_cpu(f->type) == type)
			return f;
	return NULL;
}
void bch2_free_super(struct bcache_superblock *sb)
{
	if (sb->bio)
		bio_put(sb->bio);
	if (!IS_ERR_OR_NULL(sb->bdev))
		blkdev_put(sb->bdev, sb->mode);

	free_pages((unsigned long) sb->sb, sb->page_order);
	memset(sb, 0, sizeof(*sb));
}
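/*
 * Grow the superblock buffer to the given page order, reallocating the bio
 * used for superblock IO to match; existing contents are preserved.
 */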
static int __bch2_super_realloc(struct bcache_superblock *sb, unsigned order)
{
	struct bch_sb *new_sb;
	struct bio *bio;

	if (sb->page_order >= order && sb->sb)
		return 0;

	if (dynamic_fault("bcachefs:add:super_realloc"))
		return -ENOMEM;

	bio = bio_kmalloc(GFP_KERNEL, 1 << order);
	if (!bio)
		return -ENOMEM;

	if (sb->bio)
		bio_put(sb->bio);
	sb->bio = bio;

	new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
	if (!new_sb)
		return -ENOMEM;

	if (sb->sb)
		memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);

	free_pages((unsigned long) sb->sb, sb->page_order);
	sb->sb = new_sb;
	sb->page_order = order;

	return 0;
}
static int bch2_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
{
	u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
	u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;

	if (new_bytes > max_bytes) {
		char buf[BDEVNAME_SIZE];

		pr_err("%s: superblock too big: want %llu but have %llu",
		       bdevname(sb->bdev, buf), new_bytes, max_bytes);
		return -ENOSPC;
	}

	return __bch2_super_realloc(sb, get_order(new_bytes));
}
static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
{
	u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
	struct bch_sb *sb;
	unsigned order = get_order(bytes);

	if (c->disk_sb && order <= c->disk_sb_order)
		return 0;

	sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
	if (!sb)
		return -ENOMEM;

	if (c->disk_sb)
		memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);

	free_pages((unsigned long) c->disk_sb, c->disk_sb_order);

	c->disk_sb = sb;
	c->disk_sb_order = order;

	return 0;
}
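/*
 * Resize an optional field in place: a new field is appended at the end of
 * the superblock; an existing field is grown or shrunk by memmove()ing the
 * fields that follow it. Returns the (possibly moved) field.
 */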
static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
						   struct bch_sb_field *f,
						   unsigned u64s)
{
	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;

	if (!f) {
		f = vstruct_last(sb);
		memset(f, 0, sizeof(u64) * u64s);
		f->u64s = cpu_to_le32(u64s);
	} else {
		void *src, *dst;

		src = vstruct_end(f);
		f->u64s = cpu_to_le32(u64s);
		dst = vstruct_end(f);

		memmove(dst, src, vstruct_end(sb) - src);

		if (dst > src)
			memset(src, 0, dst - src);
	}

	le32_add_cpu(&sb->u64s, u64s - old_u64s);

	return f;
}
struct bch_sb_field *bch2_sb_field_resize(struct bcache_superblock *sb,
					  enum bch_sb_field_type type,
					  unsigned u64s)
{
	struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
	ssize_t d = -old_u64s + u64s;

	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
		return NULL;

	f = __bch2_sb_field_resize(sb->sb, f, u64s);
	f->type = cpu_to_le32(type);
	return f;
}
struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
					     enum bch_sb_field_type type,
					     unsigned u64s)
{
	struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
	ssize_t d = -old_u64s + u64s;
	struct bch_dev *ca;
	unsigned i;

	lockdep_assert_held(&c->sb_lock);

	if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
		return NULL;

	/* XXX: we're not checking that offline devices have enough space */

	for_each_online_member(ca, c, i) {
		struct bcache_superblock *sb = &ca->disk_sb;

		if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
			percpu_ref_put(&ca->ref);
			return NULL;
		}
	}

	f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
	f->type = cpu_to_le32(type);
	return f;
}
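/*
 * The superblock layout (stored at a fixed sector) lists the offsets of every
 * superblock copy on the device, so backups can be found if the primary is
 * damaged. Validation checks that consecutive copies don't overlap, given the
 * maximum superblock size the layout allows.
 */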
static const char *validate_sb_layout(struct bch_sb_layout *layout)
{
	u64 offset, prev_offset, max_sectors;
	unsigned i;

	if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
		return "Not a bcachefs superblock layout";

	if (layout->layout_type != 0)
		return "Invalid superblock layout type";

	if (!layout->nr_superblocks)
		return "Invalid superblock layout: no superblocks";

	if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
		return "Invalid superblock layout: too many superblocks";

	max_sectors = 1 << layout->sb_max_size_bits;

	prev_offset = le64_to_cpu(layout->sb_offset[0]);

	for (i = 1; i < layout->nr_superblocks; i++) {
		offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset < prev_offset + max_sectors)
			return "Invalid superblock layout: superblocks overlap";
		prev_offset = offset;
	}

	return NULL;
}
static int u64_cmp(const void *_l, const void *_r)
{
	u64 l = *((const u64 *) _l), r = *((const u64 *) _r);

	return l < r ? -1 : l > r ? 1 : 0;
}
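/*
 * Validate the journal bucket list for one member device: buckets are copied
 * out and sorted, so the range and duplicate checks below only need to look
 * at the first, last and adjacent entries.
 */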
const char *bch2_sb_validate_journal(struct bch_sb *sb,
				     struct bch_member_cpu mi)
{
	struct bch_sb_field_journal *journal;
	const char *err;
	unsigned nr;
	unsigned i;
	u64 *b;

	journal = bch2_sb_get_journal(sb);
	if (!journal)
		return NULL;

	nr = bch2_nr_journal_buckets(journal);
	if (!nr)
		return NULL;

	b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
	if (!b)
		return "cannot allocate memory";

	for (i = 0; i < nr; i++)
		b[i] = le64_to_cpu(journal->buckets[i]);

	sort(b, nr, sizeof(u64), u64_cmp, NULL);

	err = "journal bucket at sector 0";
	if (!b[0])
		goto err;

	err = "journal bucket before first bucket";
	if (b[0] < mi.first_bucket)
		goto err;

	err = "journal bucket past end of device";
	if (b[nr - 1] >= mi.nbuckets)
		goto err;

	err = "duplicate journal buckets";
	for (i = 0; i + 1 < nr; i++)
		if (b[i] == b[i + 1])
			goto err;

	err = NULL;
err:
	kfree(b);
	return err;
}
static const char *bch2_sb_validate_members(struct bch_sb *sb)
{
	struct bch_sb_field_members *mi;
	unsigned i;

	mi = bch2_sb_get_members(sb);
	if (!mi)
		return "Invalid superblock: member info area missing";

	if ((void *) (mi->members + sb->nr_devices) >
	    vstruct_end(&mi->field))
		return "Invalid superblock: bad member info";

	for (i = 0; i < sb->nr_devices; i++) {
		if (!bch2_dev_exists(sb, mi, i))
			continue;

		if (le16_to_cpu(mi->members[i].bucket_size) <
		    BCH_SB_BTREE_NODE_SIZE(sb))
			return "bucket size smaller than btree node size";
	}

	return NULL;
}
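/*
 * Full superblock validation: fixed fields first (version, block size, UUIDs,
 * replica counts, btree node size), then the layout, then each optional
 * field, then per-member info, the journal and the replicas section.
 */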
const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
{
	struct bch_sb *sb = disk_sb->sb;
	struct bch_sb_field *f;
	struct bch_sb_field_members *sb_mi;
	struct bch_member_cpu mi;
	const char *err;
	u16 block_size;

	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
	    le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
		return "Unsupported superblock version";

	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
		SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);

	block_size = le16_to_cpu(sb->block_size);

	if (!is_power_of_2(block_size) ||
	    block_size > PAGE_SECTORS)
		return "Bad block size";

	if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
		return "Bad user UUID";

	if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
		return "Bad internal UUID";

	if (!sb->nr_devices ||
	    sb->nr_devices <= sb->dev_idx ||
	    sb->nr_devices > BCH_SB_MEMBERS_MAX)
		return "Bad cache device number in set";

	if (!BCH_SB_META_REPLICAS_WANT(sb) ||
	    BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
		return "Invalid number of metadata replicas";

	if (!BCH_SB_META_REPLICAS_REQ(sb) ||
	    BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
		return "Invalid number of metadata replicas";

	if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
	    BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
		return "Invalid number of data replicas";

	if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
	    BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
		return "Invalid number of data replicas";

	if (!BCH_SB_BTREE_NODE_SIZE(sb))
		return "Btree node size not set";

	if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
		return "Btree node size not a power of two";

	if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
		return "Btree node size too large";

	if (BCH_SB_GC_RESERVE(sb) < 5)
		return "gc reserve percentage too small";

	if (!sb->time_precision ||
	    le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
		return "invalid time precision";

	/* validate layout */
	err = validate_sb_layout(&sb->layout);
	if (err)
		return err;

	vstruct_for_each(sb, f) {
		if (!f->u64s)
			return "Invalid superblock: invalid optional field";

		if (vstruct_next(f) > vstruct_last(sb))
			return "Invalid superblock: invalid optional field";

		if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
			return "Invalid superblock: unknown optional field type";
	}

	err = bch2_sb_validate_members(sb);
	if (err)
		return err;

	sb_mi = bch2_sb_get_members(sb);
	mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx);

	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
		struct bch_member *m;

		for (m = sb_mi->members;
		     m < sb_mi->members + sb->nr_devices;
		     m++)
			SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
	}

	if (mi.nbuckets > LONG_MAX)
		return "Too many buckets";

	if (mi.nbuckets - mi.first_bucket < 1 << 10)
		return "Not enough buckets";

	if (mi.bucket_size < block_size)
		return "Bad bucket size";

	if (get_capacity(disk_sb->bdev->bd_disk) <
	    mi.bucket_size * mi.nbuckets)
		return "Invalid superblock: device too small";

	err = bch2_sb_validate_journal(sb, mi);
	if (err)
		return err;

	err = bch2_sb_validate_replicas(sb);
	if (err)
		return err;

	sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);

	return NULL;
}
static const char *bch2_blkdev_open(const char *path, fmode_t mode,
				    void *holder, struct block_device **ret)
{
	struct block_device *bdev;

	*ret = NULL;
	bdev = blkdev_get_by_path(path, mode, holder);
	if (bdev == ERR_PTR(-EBUSY))
		return "device busy";

	if (IS_ERR(bdev))
		return "failed to open device";

	if (mode & FMODE_WRITE)
		bdev_get_queue(bdev)->backing_dev_info->capabilities
			|= BDI_CAP_STABLE_WRITES;

	*ret = bdev;
	return NULL;
}
static void bch2_sb_update(struct bch_fs *c)
{
	struct bch_sb *src = c->disk_sb;
	struct bch_sb_field_members *mi = bch2_sb_get_members(src);
	struct bch_dev *ca;
	unsigned i;

	lockdep_assert_held(&c->sb_lock);

	c->sb.uuid = src->uuid;
	c->sb.user_uuid = src->user_uuid;
	c->sb.block_size = le16_to_cpu(src->block_size);
	c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src);
	c->sb.nr_devices = src->nr_devices;
	c->sb.clean = BCH_SB_CLEAN(src);
	c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src);
	c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
	c->sb.encoded_extent_max = 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
	c->sb.time_base_lo = le64_to_cpu(src->time_base_lo);
	c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
	c->sb.time_precision = le32_to_cpu(src->time_precision);

	for_each_member_device(ca, c, i)
		ca->mi = bch2_mi_to_cpu(mi->members + i);
}
/* doesn't copy member info */
static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
{
	struct bch_sb_field *src_f, *dst_f;

	dst->version = src->version;
	dst->seq = src->seq;
	dst->uuid = src->uuid;
	dst->user_uuid = src->user_uuid;
	memcpy(dst->label, src->label, sizeof(dst->label));

	dst->block_size = src->block_size;
	dst->nr_devices = src->nr_devices;

	dst->time_base_lo = src->time_base_lo;
	dst->time_base_hi = src->time_base_hi;
	dst->time_precision = src->time_precision;

	memcpy(dst->flags, src->flags, sizeof(dst->flags));
	memcpy(dst->features, src->features, sizeof(dst->features));
	memcpy(dst->compat, src->compat, sizeof(dst->compat));

	vstruct_for_each(src, src_f) {
		if (src_f->type == BCH_SB_FIELD_journal)
			continue;

		dst_f = bch2_sb_field_get(dst, src_f->type);
		dst_f = __bch2_sb_field_resize(dst, dst_f,
				le32_to_cpu(src_f->u64s));

		memcpy(dst_f, src_f, vstruct_bytes(src_f));
	}
}
int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
	struct bch_sb_field_journal *journal_buckets =
		bch2_sb_get_journal(src);
	unsigned journal_u64s = journal_buckets
		? le32_to_cpu(journal_buckets->field.u64s)
		: 0;
	int ret;

	lockdep_assert_held(&c->sb_lock);

	if (bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
		return -ENOMEM;

	__copy_super(c->disk_sb, src);

	ret = bch2_sb_replicas_to_cpu_replicas(c);
	if (ret)
		return ret;

	bch2_sb_update(c);
	return 0;
}
int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
	struct bch_sb_field_journal *journal_buckets =
		bch2_sb_get_journal(dst);
	unsigned journal_u64s = journal_buckets
		? le32_to_cpu(journal_buckets->field.u64s)
		: 0;
	unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
	int ret;

	ret = bch2_sb_realloc(&ca->disk_sb, u64s);
	if (ret)
		return ret;

	/* realloc may have moved the superblock buffer: */
	dst = ca->disk_sb.sb;

	__copy_super(dst, src);
	return 0;
}
/* read superblock: */

static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
{
	struct bch_csum csum;
	size_t bytes;
	unsigned order;
reread:
	bio_reset(sb->bio);
	sb->bio->bi_bdev = sb->bdev;
	sb->bio->bi_iter.bi_sector = offset;
	sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
	bch2_bio_map(sb->bio, sb->sb);

	if (submit_bio_wait(sb->bio))
		return "IO error";

	if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
		return "Not a bcachefs superblock";

	if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
	    le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
		return "Unsupported superblock version";

	bytes = vstruct_bytes(sb->sb);

	if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
		return "Bad superblock: too big";

	order = get_order(bytes);
	if (order > sb->page_order) {
		if (__bch2_super_realloc(sb, order))
			return "cannot allocate memory";
		goto reread;
	}

	if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
		return "unknown csum type";

	/* XXX: verify MACs */
	csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
			    (struct nonce) { 0 }, sb->sb);

	if (bch2_crc_cmp(csum, sb->sb->csum))
		return "bad checksum reading superblock";

	return NULL;
}
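/*
 * Read the superblock from the default (or user-specified) offset; if that
 * fails, read the layout sector and try each backup superblock it lists.
 */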
const char *bch2_read_super(struct bcache_superblock *sb,
			    struct bch_opts opts,
			    const char *path)
{
	u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
	struct bch_sb_layout layout;
	const char *err;
	unsigned i;

	memset(sb, 0, sizeof(*sb));
	sb->mode = FMODE_READ;

	if (!(opt_defined(opts.noexcl) && opts.noexcl))
		sb->mode |= FMODE_EXCL;

	if (!(opt_defined(opts.nochanges) && opts.nochanges))
		sb->mode |= FMODE_WRITE;

	err = bch2_blkdev_open(path, sb->mode, sb, &sb->bdev);
	if (err)
		return err;

	err = "cannot allocate memory";
	if (__bch2_super_realloc(sb, 0))
		goto err;

	err = "dynamic fault";
	if (bch2_fs_init_fault("read_super"))
		goto err;

	err = read_one_super(sb, offset);
	if (!err)
		goto got_super;

	if (offset != BCH_SB_SECTOR) {
		pr_err("error reading superblock: %s", err);
		goto err;
	}

	pr_err("error reading default superblock: %s", err);

	/*
	 * Error reading primary superblock - read location of backup
	 * superblocks:
	 */
	bio_reset(sb->bio);
	sb->bio->bi_bdev = sb->bdev;
	sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
	sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
	/*
	 * use sb buffer to read layout, since sb buffer is page aligned but
	 * layout won't be:
	 */
	bch2_bio_map(sb->bio, sb->sb);

	err = "IO error";
	if (submit_bio_wait(sb->bio))
		goto err;

	memcpy(&layout, sb->sb, sizeof(layout));
	err = validate_sb_layout(&layout);
	if (err)
		goto err;

	for (i = 0; i < layout.nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout.sb_offset[i]);

		if (offset == BCH_SB_SECTOR)
			continue;

		err = read_one_super(sb, offset);
		if (!err)
			goto got_super;
	}
	goto err;
got_super:
	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 le64_to_cpu(sb->sb->version),
		 le64_to_cpu(sb->sb->flags[0]),
		 le64_to_cpu(sb->sb->seq),
		 le16_to_cpu(sb->sb->u64s));

	err = "Superblock block size smaller than device block size";
	if (le16_to_cpu(sb->sb->block_size) << 9 <
	    bdev_logical_block_size(sb->bdev))
		goto err;

	return NULL;
err:
	bch2_free_super(sb);
	return err;
}
/* write superblock: */

static void write_super_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;

	/* XXX: return errors directly */

	if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
		ca->sb_write_error = 1;

	closure_put(&ca->fs->sb_write);
	percpu_ref_put(&ca->io_ref);
}
static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
	struct bch_sb *sb = ca->disk_sb.sb;
	struct bio *bio = ca->disk_sb.bio;

	sb->offset = sb->layout.sb_offset[idx];

	SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
				(struct nonce) { 0 }, sb);

	bio_reset(bio);
	bio->bi_bdev = ca->disk_sb.bdev;
	bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
	bio->bi_iter.bi_size =
		roundup(vstruct_bytes(sb),
			bdev_logical_block_size(ca->disk_sb.bdev));
	bio->bi_end_io = write_super_endio;
	bio->bi_private = ca;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
	bch2_bio_map(bio, sb);

	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
		     bio_sectors(bio));

	percpu_ref_get(&ca->io_ref);
	closure_bio_submit(bio, &c->sb_write);
}
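/*
 * Write the superblock to all online devices. Copies are written in rounds
 * (slot 0 on every device, then slot 1, and so on, with a closure_sync()
 * between rounds), so each device keeps at least one older intact copy while
 * a newer one is in flight. Afterwards, verify that enough devices were
 * written for the filesystem to be mountable.
 */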
void bch2_write_super(struct bch_fs *c)
{
	struct closure *cl = &c->sb_write;
	struct bch_dev *ca;
	unsigned i, sb = 0, nr_wrote;
	const char *err;
	struct bch_devs_mask sb_written;
	bool wrote, can_mount_without_written, can_mount_with_written;

	lockdep_assert_held(&c->sb_lock);

	closure_init_stack(cl);
	memset(&sb_written, 0, sizeof(sb_written));

	le64_add_cpu(&c->disk_sb->seq, 1);

	for_each_online_member(ca, c, i)
		bch2_sb_from_fs(c, ca);

	for_each_online_member(ca, c, i) {
		err = bch2_sb_validate(&ca->disk_sb);
		if (err) {
			bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
			goto out;
		}
	}

	if (c->opts.nochanges ||
	    test_bit(BCH_FS_ERROR, &c->flags))
		goto out;

	for_each_online_member(ca, c, i) {
		__set_bit(ca->dev_idx, sb_written.d);
		ca->sb_write_error = 0;
	}

	do {
		wrote = false;
		for_each_online_member(ca, c, i)
			if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
				write_one_super(c, ca, sb);
				wrote = true;
			}
		closure_sync(cl);
		sb++;
	} while (wrote);

	for_each_online_member(ca, c, i)
		if (ca->sb_write_error)
			__clear_bit(ca->dev_idx, sb_written.d);

	nr_wrote = dev_mask_nr(&sb_written);

	can_mount_with_written =
		bch2_have_enough_devs(c,
			__bch2_replicas_status(c, sb_written),
			BCH_FORCE_IF_DEGRADED);

	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
		sb_written.d[i] = ~sb_written.d[i];

	can_mount_without_written =
		bch2_have_enough_devs(c,
			__bch2_replicas_status(c, sb_written),
			BCH_FORCE_IF_DEGRADED);

	/*
	 * If we would be able to mount _without_ the devices we successfully
	 * wrote superblocks to, we weren't able to write to enough devices:
	 *
	 * Exception: if we can mount without the successes because we haven't
	 * written anything (new filesystem), we continue if we'd be able to
	 * mount with the devices we did successfully write to:
	 */
	bch2_fs_fatal_err_on(!nr_wrote ||
			     (can_mount_without_written &&
			      !can_mount_with_written), c,
		"Unable to write superblock to sufficient devices");
out:
	/* Make new options visible after they're persistent: */
	bch2_sb_update(c);
}
/* replica information: */

static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
	return (void *) r->entries + r->entry_size * i;
}

static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
				     unsigned dev)
{
	return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
				    unsigned dev)
{
	e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
	return (r->entry_size -
		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
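/*
 * In-memory replicas entries are fixed-size records: a data type byte
 * followed by a bitmap of device indices, eight devices per byte - e.g.
 * device 10 is bit 2 of devs[1]. Fixed-size entries can be compared with
 * memcmp(), which is what the sorted lookups below rely on.
 */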
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
					unsigned *nr,
					unsigned *bytes,
					unsigned *max_dev)
{
	struct bch_replicas_entry *i;
	unsigned j;

	*nr = 0;
	*bytes = sizeof(*r);
	*max_dev = 0;

	if (!r)
		return;

	for_each_replicas_entry(r, i) {
		for (j = 0; j < i->nr; j++)
			*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
		(*nr)++;
	}

	*bytes = (void *) i - (void *) r;
}
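/*
 * Build the in-memory replicas table from the superblock field: entries are
 * converted to the fixed-size bitmap representation, then sorted with
 * memcmp() into an eytzinger (cache-friendly binary search) layout so that
 * eytzinger0_find() can be used for lookups.
 */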
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
	struct bch_replicas_cpu *cpu_r;
	unsigned i, nr, bytes, max_dev, entry_size;

	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);

	cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
			nr * entry_size, GFP_NOIO);
	if (!cpu_r)
		return NULL;

	cpu_r->nr = nr;
	cpu_r->entry_size = entry_size;

	if (nr) {
		struct bch_replicas_cpu_entry *dst =
			cpu_replicas_entry(cpu_r, 0);
		struct bch_replicas_entry *src = sb_r->entries;

		while (dst < cpu_replicas_entry(cpu_r, nr)) {
			dst->data_type = src->data_type;
			for (i = 0; i < src->nr; i++)
				replicas_set_dev(dst, src->devs[i]);

			src = replicas_entry_next(src);
			dst = (void *) dst + entry_size;
		}
	}

	eytzinger0_sort(cpu_r->entries,
			cpu_r->nr,
			cpu_r->entry_size,
			memcmp, NULL);
	return cpu_r;
}
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_cpu *cpu_r, *old_r;

	lockdep_assert_held(&c->sb_lock);

	sb_r = bch2_sb_get_replicas(c->disk_sb);
	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		return -ENOMEM;

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas, cpu_r);
	if (old_r)
		kfree_rcu(old_r, rcu);

	return 0;
}
static void bkey_to_replicas(struct bkey_s_c_extent e,
			     enum bch_data_type data_type,
			     struct bch_replicas_cpu_entry *r,
			     unsigned *max_dev)
{
	const struct bch_extent_ptr *ptr;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	memset(r, 0, sizeof(*r));
	r->data_type = data_type;

	*max_dev = 0;

	extent_for_each_ptr(e, ptr)
		if (!ptr->cached) {
			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
			replicas_set_dev(r, ptr->dev);
		}
}
/*
 * for when gc of replica information is in progress:
 */
static int bch2_update_gc_replicas(struct bch_fs *c,
				   struct bch_replicas_cpu *gc_r,
				   struct bkey_s_c_extent e,
				   enum bch_data_type data_type)
{
	struct bch_replicas_cpu_entry new_e;
	struct bch_replicas_cpu *new;
	unsigned i, nr, entry_size, max_dev;

	bkey_to_replicas(e, data_type, &new_e, &max_dev);

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);
	entry_size = max(entry_size, gc_r->entry_size);
	nr = gc_r->nr + 1;

	new = kzalloc(sizeof(struct bch_replicas_cpu) +
		      nr * entry_size, GFP_NOIO);
	if (!new)
		return -ENOMEM;

	new->nr = nr;
	new->entry_size = entry_size;

	for (i = 0; i < gc_r->nr; i++)
		memcpy(cpu_replicas_entry(new, i),
		       cpu_replicas_entry(gc_r, i),
		       gc_r->entry_size);

	memcpy(cpu_replicas_entry(new, nr - 1),
	       &new_e,
	       new->entry_size);

	eytzinger0_sort(new->entries,
			new->nr,
			new->entry_size,
			memcmp, NULL);

	rcu_assign_pointer(c->replicas_gc, new);
	kfree_rcu(gc_r, rcu);
	return 0;
}
static bool replicas_has_extent(struct bch_replicas_cpu *r,
				struct bkey_s_c_extent e,
				enum bch_data_type data_type)
{
	struct bch_replicas_cpu_entry search;
	unsigned max_dev;

	bkey_to_replicas(e, data_type, &search, &max_dev);

	return max_dev < replicas_dev_slots(r) &&
		eytzinger0_find(r->entries, r->nr,
				r->entry_size,
				memcmp, &search) < r->nr;
}
bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
			  enum bch_data_type data_type)
{
	bool ret;

	rcu_read_lock();
	ret = replicas_has_extent(rcu_dereference(c->replicas),
				  e, data_type);
	rcu_read_unlock();

	return ret;
}
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
					  struct bkey_s_c_extent e,
					  enum bch_data_type data_type)
{
	struct bch_replicas_cpu *gc_r;
	const struct bch_extent_ptr *ptr;
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *new_entry;
	unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
	int ret = 0;

	mutex_lock(&c->sb_lock);

	gc_r = rcu_dereference_protected(c->replicas_gc,
					 lockdep_is_held(&c->sb_lock));
	if (gc_r &&
	    !replicas_has_extent(gc_r, e, data_type)) {
		ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
		if (ret)
			goto out;
	}

	/* recheck, might have raced */
	if (bch2_sb_has_replicas(c, e, data_type)) {
		mutex_unlock(&c->sb_lock);
		return 0;
	}

	new_entry_bytes = sizeof(struct bch_replicas_entry) +
		bch2_extent_nr_dirty_ptrs(e.s_c);

	sb_r = bch2_sb_get_replicas(c->disk_sb);

	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

	new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));

	sb_r = bch2_fs_sb_resize_replicas(c,
			DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
				     sizeof(u64)));
	if (!sb_r) {
		ret = -ENOSPC;
		goto out;
	}

	new_entry = (void *) sb_r + bytes;
	new_entry->data_type = data_type;
	new_entry->nr = 0;

	extent_for_each_ptr(e, ptr)
		if (!ptr->cached)
			new_entry->devs[new_entry->nr++] = ptr->dev;

	ret = bch2_sb_replicas_to_cpu_replicas(c);
	if (ret) {
		memset(new_entry, 0,
		       vstruct_end(&sb_r->field) - (void *) new_entry);
		goto out;
	}

	bch2_write_super(c);
out:
	mutex_unlock(&c->sb_lock);
	return ret;
}
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
			  enum bch_data_type data_type)
{
	struct bch_replicas_cpu *gc_r;
	bool marked;

	rcu_read_lock();
	marked = replicas_has_extent(rcu_dereference(c->replicas),
				     e, data_type) &&
		(!(gc_r = rcu_dereference(c->replicas_gc)) ||
		 replicas_has_extent(gc_r, e, data_type));
	rcu_read_unlock();

	if (marked)
		return 0;

	return bch2_check_mark_super_slowpath(c, e, data_type);
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
					      struct bch_devs_mask online_devs)
{
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, dev, dev_slots, nr_online, nr_offline;
	struct replicas_status ret;

	memset(&ret, 0, sizeof(ret));

	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		ret.replicas[i].nr_online = UINT_MAX;

	rcu_read_lock();
	r = rcu_dereference(c->replicas);
	dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);

	for (i = 0; i < r->nr; i++) {
		e = cpu_replicas_entry(r, i);

		BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));

		nr_online = nr_offline = 0;

		for (dev = 0; dev < dev_slots; dev++) {
			if (!replicas_test_dev(e, dev))
				continue;

			if (test_bit(dev, online_devs.d))
				nr_online++;
			else
				nr_offline++;
		}

		ret.replicas[e->data_type].nr_online =
			min(ret.replicas[e->data_type].nr_online,
			    nr_online);

		ret.replicas[e->data_type].nr_offline =
			max(ret.replicas[e->data_type].nr_offline,
			    nr_offline);
	}

	rcu_read_unlock();

	return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
	return __bch2_replicas_status(c, bch2_online_devs(c));
}
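/*
 * Decide whether the filesystem can mount given which replicas are online:
 * missing metadata (journal/btree) or data replicas are only tolerated when
 * the corresponding DEGRADED/LOST force flag is set.
 */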
bool bch2_have_enough_devs(struct bch_fs *c,
			   struct replicas_status s,
			   unsigned flags)
{
	if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
	     s.replicas[BCH_DATA_BTREE].nr_offline) &&
	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
		return false;

	if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
	     !s.replicas[BCH_DATA_BTREE].nr_online) &&
	    !(flags & BCH_FORCE_IF_METADATA_LOST))
		return false;

	if (s.replicas[BCH_DATA_USER].nr_offline &&
	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
		return false;

	if (!s.replicas[BCH_DATA_USER].nr_online &&
	    !(flags & BCH_FORCE_IF_DATA_LOST))
		return false;

	return true;
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
	struct replicas_status s = bch2_replicas_status(c);

	return meta
		? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
		      s.replicas[BCH_DATA_BTREE].nr_online)
		: s.replicas[BCH_DATA_USER].nr_online;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, ret = 0;

	rcu_read_lock();
	r = rcu_dereference(c->replicas);

	if (ca->dev_idx >= replicas_dev_slots(r))
		goto out;

	for (i = 0; i < r->nr; i++) {
		e = cpu_replicas_entry(r, i);

		if (replicas_test_dev(e, ca->dev_idx))
			ret |= 1 << e->data_type;
	}
out:
	rcu_read_unlock();

	return ret;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
	struct bch_sb_field_members *mi;
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_cpu *cpu_r = NULL;
	struct bch_replicas_entry *e;
	const char *err;
	unsigned i;

	mi = bch2_sb_get_members(sb);
	sb_r = bch2_sb_get_replicas(sb);
	if (!sb_r)
		return NULL;

	for_each_replicas_entry(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: too many devices";
		if (e->nr >= BCH_REPLICAS_MAX)
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		goto err;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i + 1 < cpu_r->nr; i++) {
		struct bch_replicas_cpu_entry *l =
			cpu_replicas_entry(cpu_r, i);
		struct bch_replicas_cpu_entry *r =
			cpu_replicas_entry(cpu_r, i + 1);

		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

		err = "duplicate replicas entry";
		if (!memcmp(l, r, cpu_r->entry_size))
			goto err;
	}

	err = NULL;
err:
	kfree(cpu_r);
	return err;
}
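/*
 * Replicas GC protocol: gc_start() installs a second table (replicas_gc)
 * containing only the entries whose data types are not selected by typemask;
 * while gc runs, bch2_check_mark_super() re-adds entries for the data it
 * finds. gc_end() then writes the gc table back to the superblock and makes
 * it the authoritative replicas list, dropping stale entries.
 */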
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_cpu *r, *old_r;
	struct bch_replicas_entry *dst_e;
	size_t i, j, bytes, dev_slots;
	int ret = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);

	r = rcu_dereference_protected(c->replicas_gc,
				      lockdep_is_held(&c->sb_lock));

	if (err) {
		rcu_assign_pointer(c->replicas_gc, NULL);
		kfree_rcu(r, rcu);
		ret = err;
		goto err;
	}

	dev_slots = replicas_dev_slots(r);

	bytes = sizeof(struct bch_sb_field_replicas);

	for (i = 0; i < r->nr; i++) {
		struct bch_replicas_cpu_entry *e =
			cpu_replicas_entry(r, i);

		bytes += sizeof(struct bch_replicas_entry);
		for (j = 0; j < r->entry_size - 1; j++)
			bytes += hweight8(e->devs[j]);
	}

	sb_r = bch2_fs_sb_resize_replicas(c,
			DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
	if (!sb_r) {
		ret = -ENOSPC;
		goto err;
	}

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst_e = sb_r->entries;
	for (i = 0; i < r->nr; i++) {
		struct bch_replicas_cpu_entry *src_e =
			cpu_replicas_entry(r, i);

		dst_e->data_type = src_e->data_type;

		for (j = 0; j < dev_slots; j++)
			if (replicas_test_dev(src_e, j))
				dst_e->devs[dst_e->nr++] = j;

		dst_e = replicas_entry_next(dst_e);
	}

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas, r);
	rcu_assign_pointer(c->replicas_gc, NULL);
	kfree_rcu(old_r, rcu);

	bch2_write_super(c);
err:
	mutex_unlock(&c->sb_lock);
	return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_cpu *r, *src;
	unsigned i;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc);

	src = rcu_dereference_protected(c->replicas,
					lockdep_is_held(&c->sb_lock));

	r = kzalloc(sizeof(struct bch_replicas_cpu) +
		    src->nr * src->entry_size, GFP_NOIO);
	if (!r) {
		mutex_unlock(&c->sb_lock);
		return -ENOMEM;
	}

	r->entry_size = src->entry_size;
	r->nr = 0;

	for (i = 0; i < src->nr; i++) {
		struct bch_replicas_cpu_entry *dst_e =
			cpu_replicas_entry(r, r->nr);
		struct bch_replicas_cpu_entry *src_e =
			cpu_replicas_entry(src, i);

		if (!(src_e->data_type & typemask)) {
			memcpy(dst_e, src_e, r->entry_size);
			r->nr++;
		}
	}

	eytzinger0_sort(r->entries,
			r->nr,
			r->entry_size,
			memcmp, NULL);

	rcu_assign_pointer(c->replicas_gc, r);
	mutex_unlock(&c->sb_lock);

	return 0;
}