#include "bcachefs.h"
#include "checksum.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "super-io.h"
#include "super.h"
#include "vstructs.h"

#include <linux/backing-dev.h>
#include <linux/sort.h>

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);

static inline void __bch2_sb_layout_size_assert(void)
{
        BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}

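/*
 * Optional superblock fields are variable length structures (vstructs)
 * packed one after another after the fixed part of the superblock; walk
 * them and return the first field of the given type, if present:
 */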
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
                                      enum bch_sb_field_type type)
{
        struct bch_sb_field *f;

        /* XXX: need locking around superblock to access optional fields */

        vstruct_for_each(sb, f)
                if (le32_to_cpu(f->type) == type)
                        return f;
        return NULL;
}

void bch2_free_super(struct bcache_superblock *sb)
{
        if (sb->bio)
                bio_put(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
                blkdev_put(sb->bdev, sb->mode);

        free_pages((unsigned long) sb->sb, sb->page_order);
        memset(sb, 0, sizeof(*sb));
}

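/*
 * Grow the in-memory superblock buffer to the given page order, preserving
 * the current contents, and keep sb->bio big enough to read or write the
 * whole buffer in one I/O:
 */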
static int __bch2_super_realloc(struct bcache_superblock *sb, unsigned order)
{
        struct bch_sb *new_sb;
        struct bio *bio;

        if (sb->page_order >= order && sb->sb)
                return 0;

        if (dynamic_fault("bcachefs:add:super_realloc"))
                return -ENOMEM;

        bio = bio_kmalloc(GFP_KERNEL, 1 << order);
        if (!bio)
                return -ENOMEM;

        if (sb->bio)
                bio_put(sb->bio);
        sb->bio = bio;

        new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
        if (!new_sb)
                return -ENOMEM;

        if (sb->sb)
                memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);

        free_pages((unsigned long) sb->sb, sb->page_order);
        sb->sb = new_sb;

        sb->page_order = order;

        return 0;
}

static int bch2_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
{
        u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
        u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;

        if (new_bytes > max_bytes) {
                char buf[BDEVNAME_SIZE];

                pr_err("%s: superblock too big: want %llu but have %llu",
                       bdevname(sb->bdev, buf), new_bytes, max_bytes);
                return -ENOSPC;
        }

        return __bch2_super_realloc(sb, get_order(new_bytes));
}

static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
{
        u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
        struct bch_sb *sb;
        unsigned order = get_order(bytes);

        if (c->disk_sb && order <= c->disk_sb_order)
                return 0;

        sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
        if (!sb)
                return -ENOMEM;

        if (c->disk_sb)
                memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);

        free_pages((unsigned long) c->disk_sb, c->disk_sb_order);

        c->disk_sb = sb;
        c->disk_sb_order = order;
        return 0;
}

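/*
 * Resize @f to @u64s within @sb: if @f is NULL, a new zeroed field is
 * appended at the end; otherwise the fields following @f are shifted to
 * make room (or to close the gap) and any newly exposed space is zeroed.
 * The caller must have made sure the superblock buffer is big enough.
 */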
static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
                                                   struct bch_sb_field *f,
                                                   unsigned u64s)
{
        unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;

        if (!f) {
                f = vstruct_last(sb);
                memset(f, 0, sizeof(u64) * u64s);
                f->u64s = cpu_to_le32(u64s);
                f->type = 0;
        } else {
                void *src, *dst;

                src = vstruct_end(f);
                f->u64s = cpu_to_le32(u64s);
                dst = vstruct_end(f);

                memmove(dst, src, vstruct_end(sb) - src);

                if (dst > src)
                        memset(src, 0, dst - src);
        }

        le32_add_cpu(&sb->u64s, u64s - old_u64s);

        return f;
}

struct bch_sb_field *bch2_sb_field_resize(struct bcache_superblock *sb,
                                          enum bch_sb_field_type type,
                                          unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;

        if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
                return NULL;

        f = __bch2_sb_field_resize(sb->sb, f, u64s);
        f->type = type;
        return f;
}

struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
                                             enum bch_sb_field_type type,
                                             unsigned u64s)
{
        struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
                return NULL;

        /* XXX: we're not checking that offline devices have enough space */

        for_each_online_member(ca, c, i) {
                struct bcache_superblock *sb = &ca->disk_sb;

                if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
                        percpu_ref_put(&ca->ref);
                        return NULL;
                }
        }

        f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
        f->type = type;
        return f;
}

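/*
 * The superblock layout, read from a fixed sector, records where every
 * superblock copy on the device lives, so the backups can be found when
 * the primary superblock is damaged:
 */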
static const char *validate_sb_layout(struct bch_sb_layout *layout)
{
        u64 offset, prev_offset, max_sectors;
        unsigned i;

        if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock layout";

        if (layout->layout_type != 0)
                return "Invalid superblock layout type";

        if (!layout->nr_superblocks)
                return "Invalid superblock layout: no superblocks";

        if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
                return "Invalid superblock layout: too many superblocks";

        max_sectors = 1 << layout->sb_max_size_bits;

        prev_offset = le64_to_cpu(layout->sb_offset[0]);

        for (i = 1; i < layout->nr_superblocks; i++) {
                offset = le64_to_cpu(layout->sb_offset[i]);

                if (offset < prev_offset + max_sectors)
                        return "Invalid superblock layout: superblocks overlap";
                prev_offset = offset;
        }

        return NULL;
}

static int u64_cmp(const void *_l, const void *_r)
{
        u64 l = *((const u64 *) _l), r = *((const u64 *) _r);

        return l < r ? -1 : l > r ? 1 : 0;
}

const char *bch2_sb_validate_journal(struct bch_sb *sb,
                                     struct bch_member_cpu mi)
{
        struct bch_sb_field_journal *journal;
        const char *err;
        unsigned nr;
        unsigned i;
        u64 *b;

        journal = bch2_sb_get_journal(sb);
        if (!journal)
                return NULL;

        nr = bch2_nr_journal_buckets(journal);
        if (!nr)
                return NULL;

        b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
        if (!b)
                return "cannot allocate memory";

        for (i = 0; i < nr; i++)
                b[i] = le64_to_cpu(journal->buckets[i]);

        sort(b, nr, sizeof(u64), u64_cmp, NULL);

        err = "journal bucket at sector 0";
        if (!b[0])
                goto err;

        err = "journal bucket before first bucket";
        if (b[0] < mi.first_bucket)
                goto err;

        err = "journal bucket past end of device";
        if (b[nr - 1] >= mi.nbuckets)
                goto err;

        err = "duplicate journal buckets";
        for (i = 0; i + 1 < nr; i++)
                if (b[i] == b[i + 1])
                        goto err;

        err = NULL;
err:
        kfree(b);
        return err;
}

static const char *bch2_sb_validate_members(struct bch_sb *sb)
{
        struct bch_sb_field_members *mi;
        unsigned i;

        mi = bch2_sb_get_members(sb);
        if (!mi)
                return "Invalid superblock: member info area missing";

        if ((void *) (mi->members + sb->nr_devices) >
            vstruct_end(&mi->field))
                return "Invalid superblock: bad member info";

        for (i = 0; i < sb->nr_devices; i++) {
                if (!bch2_dev_exists(sb, mi, i))
                        continue;

                if (le16_to_cpu(mi->members[i].bucket_size) <
                    BCH_SB_BTREE_NODE_SIZE(sb))
                        return "bucket size smaller than btree node size";
        }

        return NULL;
}

const char *bch2_sb_validate(struct bcache_superblock *disk_sb)
{
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *sb_mi;
        struct bch_member_cpu mi;
        const char *err;
        u16 block_size;

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX)
                SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);

        block_size = le16_to_cpu(sb->block_size);

        if (!is_power_of_2(block_size) ||
            block_size > PAGE_SECTORS)
                return "Bad block size";

        if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
                return "Bad user UUID";

        if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
                return "Bad internal UUID";

        if (!sb->nr_devices ||
            sb->nr_devices <= sb->dev_idx ||
            sb->nr_devices > BCH_SB_MEMBERS_MAX)
                return "Bad cache device number in set";

        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
            BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_META_REPLICAS_REQ(sb) ||
            BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";

        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
            BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
            BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";

        if (!BCH_SB_BTREE_NODE_SIZE(sb))
                return "Btree node size not set";

        if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
                return "Btree node size not a power of two";

        if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
                return "Btree node size too large";

        if (BCH_SB_GC_RESERVE(sb) < 5)
                return "gc reserve percentage too small";

        if (!sb->time_precision ||
            le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
                return "invalid time precision";

        /* validate layout */
        err = validate_sb_layout(&sb->layout);
        if (err)
                return err;

        vstruct_for_each(sb, f) {
                if (!f->u64s)
                        return "Invalid superblock: invalid optional field";

                if (vstruct_next(f) > vstruct_last(sb))
                        return "Invalid superblock: invalid optional field";

                if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
                        return "Invalid superblock: unknown optional field type";
        }

        err = bch2_sb_validate_members(sb);
        if (err)
                return err;

        sb_mi = bch2_sb_get_members(sb);
        mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx);

        if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
                struct bch_member *m;

                for (m = sb_mi->members;
                     m < sb_mi->members + sb->nr_devices;
                     m++)
                        SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
        }

        if (mi.nbuckets > LONG_MAX)
                return "Too many buckets";

        if (mi.nbuckets - mi.first_bucket < 1 << 10)
                return "Not enough buckets";

        if (mi.bucket_size < block_size)
                return "Bad bucket size";

        if (get_capacity(disk_sb->bdev->bd_disk) <
            mi.bucket_size * mi.nbuckets)
                return "Invalid superblock: device too small";

        err = bch2_sb_validate_journal(sb, mi);
        if (err)
                return err;

        err = bch2_sb_validate_replicas(sb);
        if (err)
                return err;

        sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);

        return NULL;
}

/* device open: */

static const char *bch2_blkdev_open(const char *path, fmode_t mode,
                                    void *holder, struct block_device **ret)
{
        struct block_device *bdev;

        *ret = NULL;
        bdev = blkdev_get_by_path(path, mode, holder);
        if (bdev == ERR_PTR(-EBUSY))
                return "device busy";

        if (IS_ERR(bdev))
                return "failed to open device";

        if (mode & FMODE_WRITE)
                bdev_get_queue(bdev)->backing_dev_info->capabilities
                        |= BDI_CAP_STABLE_WRITES;

        *ret = bdev;
        return NULL;
}

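/*
 * Cache commonly used superblock fields, in native byte order, in c->sb:
 */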
static void bch2_sb_update(struct bch_fs *c)
{
        struct bch_sb *src = c->disk_sb;
        struct bch_sb_field_members *mi = bch2_sb_get_members(src);
        struct bch_dev *ca;
        unsigned i;

        lockdep_assert_held(&c->sb_lock);

        c->sb.uuid              = src->uuid;
        c->sb.user_uuid         = src->user_uuid;
        c->sb.block_size        = le16_to_cpu(src->block_size);
        c->sb.btree_node_size   = BCH_SB_BTREE_NODE_SIZE(src);
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
        c->sb.str_hash_type     = BCH_SB_STR_HASH_TYPE(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
        c->sb.encoded_extent_max = 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
        c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
        c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
        c->sb.time_precision    = le32_to_cpu(src->time_precision);

        for_each_member_device(ca, c, i)
                ca->mi = bch2_mi_to_cpu(mi->members + i);
}

/* doesn't copy member info */
static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
{
        struct bch_sb_field *src_f, *dst_f;

        dst->version            = src->version;
        dst->seq                = src->seq;
        dst->uuid               = src->uuid;
        dst->user_uuid          = src->user_uuid;
        memcpy(dst->label,      src->label, sizeof(dst->label));

        dst->block_size         = src->block_size;
        dst->nr_devices         = src->nr_devices;

        dst->time_base_lo       = src->time_base_lo;
        dst->time_base_hi       = src->time_base_hi;
        dst->time_precision     = src->time_precision;

        memcpy(dst->flags,      src->flags,     sizeof(dst->flags));
        memcpy(dst->features,   src->features,  sizeof(dst->features));
        memcpy(dst->compat,     src->compat,    sizeof(dst->compat));

        vstruct_for_each(src, src_f) {
                if (src_f->type == BCH_SB_FIELD_journal)
                        continue;

                dst_f = bch2_sb_field_get(dst, src_f->type);
                dst_f = __bch2_sb_field_resize(dst, dst_f,
                                le32_to_cpu(src_f->u64s));

                memcpy(dst_f, src_f, vstruct_bytes(src_f));
        }
}

int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
{
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(src);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        int ret;

        lockdep_assert_held(&c->sb_lock);

        if (bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
                return -ENOMEM;

        __copy_super(c->disk_sb, src);

        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;

        bch2_sb_update(c);
        return 0;
}

int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(dst);
        unsigned journal_u64s = journal_buckets
                ? le32_to_cpu(journal_buckets->field.u64s)
                : 0;
        unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
        int ret;

        ret = bch2_sb_realloc(&ca->disk_sb, u64s);
        if (ret)
                return ret;

        __copy_super(dst, src);

        return 0;
}

/* read superblock: */

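/*
 * Read a single superblock at @offset: if the superblock turns out to be
 * bigger than the current buffer, grow the buffer and reread so that the
 * trailing optional fields aren't truncated:
 */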
static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
{
        struct bch_csum csum;
        size_t bytes;
        unsigned order;
reread:
        bio_reset(sb->bio);
        sb->bio->bi_bdev = sb->bdev;
        sb->bio->bi_iter.bi_sector = offset;
        sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(sb->bio, sb->sb);

        if (submit_bio_wait(sb->bio))
                return "IO error";

        if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
                return "Not a bcachefs superblock";

        if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
            le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
                return "Unsupported superblock version";

        bytes = vstruct_bytes(sb->sb);

        if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
                return "Bad superblock: too big";

        order = get_order(bytes);
        if (order > sb->page_order) {
                if (__bch2_super_realloc(sb, order))
                        return "cannot allocate memory";
                goto reread;
        }

        if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
                return "unknown csum type";

        /* XXX: verify MACs */
        csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
                            (struct nonce) { 0 }, sb->sb);

        if (bch2_crc_cmp(csum, sb->sb->csum))
                return "bad checksum reading superblock";

        return NULL;
}

const char *bch2_read_super(struct bcache_superblock *sb,
                            struct bch_opts opts,
                            const char *path)
{
        u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
        struct bch_sb_layout layout;
        const char *err;
        unsigned i;

        memset(sb, 0, sizeof(*sb));
        sb->mode = FMODE_READ;

        if (!(opt_defined(opts.noexcl) && opts.noexcl))
                sb->mode |= FMODE_EXCL;

        if (!(opt_defined(opts.nochanges) && opts.nochanges))
                sb->mode |= FMODE_WRITE;

        err = bch2_blkdev_open(path, sb->mode, sb, &sb->bdev);
        if (err)
                return err;

        err = "cannot allocate memory";
        if (__bch2_super_realloc(sb, 0))
                goto err;

        err = "dynamic fault";
        if (bch2_fs_init_fault("read_super"))
                goto err;

        err = read_one_super(sb, offset);
        if (!err)
                goto got_super;

        if (offset != BCH_SB_SECTOR) {
                pr_err("error reading superblock: %s", err);
                goto err;
        }

        pr_err("error reading default superblock: %s", err);

        /*
         * Error reading primary superblock - read location of backup
         * superblocks:
         */
        bio_reset(sb->bio);
        sb->bio->bi_bdev = sb->bdev;
        sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
        sb->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        /*
         * use sb buffer to read layout, since sb buffer is page aligned but
         * layout won't be:
         */
        bch2_bio_map(sb->bio, sb->sb);

        err = "IO error";
        if (submit_bio_wait(sb->bio))
                goto err;

        memcpy(&layout, sb->sb, sizeof(layout));
        err = validate_sb_layout(&layout);
        if (err)
                goto err;

        for (i = 0; i < layout.nr_superblocks; i++) {
                u64 offset = le64_to_cpu(layout.sb_offset[i]);

                if (offset == BCH_SB_SECTOR)
                        continue;

                err = read_one_super(sb, offset);
                if (!err)
                        goto got_super;
        }
        goto err;
got_super:
        pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
                 le64_to_cpu(sb->sb->version),
                 le64_to_cpu(sb->sb->flags),
                 le64_to_cpu(sb->sb->seq),
                 le32_to_cpu(sb->sb->u64s));

        err = "Superblock block size smaller than device block size";
        if (le16_to_cpu(sb->sb->block_size) << 9 <
            bdev_logical_block_size(sb->bdev))
                goto err;

        return NULL;
err:
        bch2_free_super(sb);
        return err;
}

/* write superblock: */

static void write_super_endio(struct bio *bio)
{
        struct bch_dev *ca = bio->bi_private;

        /* XXX: return errors directly */

        if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
                ca->sb_write_error = 1;

        closure_put(&ca->fs->sb_write);
        percpu_ref_put(&ca->io_ref);
}

static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
        struct bch_sb *sb = ca->disk_sb.sb;
        struct bio *bio = ca->disk_sb.bio;

        sb->offset = sb->layout.sb_offset[idx];

        SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
        sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
                                (struct nonce) { 0 }, sb);

        bio_reset(bio);
        bio->bi_bdev            = ca->disk_sb.bdev;
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
        bio->bi_iter.bi_size    =
                roundup(vstruct_bytes(sb),
                        bdev_logical_block_size(ca->disk_sb.bdev));
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
        bch2_bio_map(bio, sb);

        this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
                     bio_sectors(bio));

        percpu_ref_get(&ca->io_ref);
        closure_bio_submit(bio, &c->sb_write);
}

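/*
 * Write the superblock to all online devices: bump the sequence number,
 * give each device an up to date copy, then write out the redundant
 * copies in lockstep - copy 0 on every device, then copy 1, and so on -
 * so only one copy per device is ever in flight. Afterwards, check that
 * the devices we actually succeeded in writing to would be enough to
 * mount the filesystem:
 */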
void bch2_write_super(struct bch_fs *c)
{
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
        unsigned i, sb = 0, nr_wrote;
        const char *err;
        struct bch_devs_mask sb_written;
        bool wrote, can_mount_without_written, can_mount_with_written;

        lockdep_assert_held(&c->sb_lock);

        closure_init_stack(cl);
        memset(&sb_written, 0, sizeof(sb_written));

        le64_add_cpu(&c->disk_sb->seq, 1);

        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);

        for_each_online_member(ca, c, i) {
                err = bch2_sb_validate(&ca->disk_sb);
                if (err) {
                        bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
                        goto out;
                }
        }

        if (c->opts.nochanges ||
            test_bit(BCH_FS_ERROR, &c->flags))
                goto out;

        for_each_online_member(ca, c, i) {
                __set_bit(ca->dev_idx, sb_written.d);
                ca->sb_write_error = 0;
        }

        do {
                wrote = false;
                for_each_online_member(ca, c, i)
                        if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
                                write_one_super(c, ca, sb);
                                wrote = true;
                        }
                closure_sync(cl);
                sb++;
        } while (wrote);

        for_each_online_member(ca, c, i)
                if (ca->sb_write_error)
                        __clear_bit(ca->dev_idx, sb_written.d);

        nr_wrote = dev_mask_nr(&sb_written);

        can_mount_with_written =
                bch2_have_enough_devs(c,
                        __bch2_replicas_status(c, sb_written),
                        BCH_FORCE_IF_DEGRADED);

        for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
                sb_written.d[i] = ~sb_written.d[i];

        can_mount_without_written =
                bch2_have_enough_devs(c,
                        __bch2_replicas_status(c, sb_written),
                        BCH_FORCE_IF_DEGRADED);

        /*
         * If we would be able to mount _without_ the devices we successfully
         * wrote superblocks to, we weren't able to write to enough devices:
         *
         * Exception: if we can mount without the successes because we haven't
         * written anything (new filesystem), we continue if we'd be able to
         * mount with the devices we did successfully write to:
         */
        bch2_fs_fatal_err_on(!nr_wrote ||
                             (can_mount_without_written &&
                              !can_mount_with_written), c,
                "Unable to write superblock to sufficient devices");
out:
        /* Make new options visible after they're persistent: */
        bch2_sb_update(c);
}

/* replica information: */

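/*
 * The CPU representation of the replicas section is a flat array of fixed
 * size entries, each holding a data type plus a bitmap of the devices that
 * hold replicas, kept sorted (in eytzinger layout) so membership can be
 * tested with a binary search:
 */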
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
        return (void *) r->entries + r->entry_size * i;
}

static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
                                     unsigned dev)
{
        return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
                                    unsigned dev)
{
        e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
        return (r->entry_size -
                offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}

static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
                                        unsigned *nr,
                                        unsigned *bytes,
                                        unsigned *max_dev)
{
        struct bch_replicas_entry *i;
        unsigned j;

        *nr     = 0;
        *bytes  = sizeof(*r);
        *max_dev = 0;

        if (!r)
                return;

        for_each_replicas_entry(r, i) {
                for (j = 0; j < i->nr; j++)
                        *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
                (*nr)++;
        }

        *bytes = (void *) i - (void *) r;
}

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
        struct bch_replicas_cpu *cpu_r;
        unsigned i, nr, bytes, max_dev, entry_size;

        bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);

        cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
                        nr * entry_size, GFP_NOIO);
        if (!cpu_r)
                return NULL;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        if (nr) {
                struct bch_replicas_cpu_entry *dst =
                        cpu_replicas_entry(cpu_r, 0);
                struct bch_replicas_entry *src = sb_r->entries;

                while (dst < cpu_replicas_entry(cpu_r, nr)) {
                        dst->data_type = src->data_type;
                        for (i = 0; i < src->nr; i++)
                                replicas_set_dev(dst, src->devs[i]);

                        src     = replicas_entry_next(src);
                        dst     = (void *) dst + entry_size;
                }
        }

        eytzinger0_sort(cpu_r->entries,
                        cpu_r->nr,
                        cpu_r->entry_size,
                        memcmp, NULL);
        return cpu_r;
}

static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r, *old_r;

        lockdep_assert_held(&c->sb_lock);

        sb_r    = bch2_sb_get_replicas(c->disk_sb);
        cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                return -ENOMEM;

        old_r = c->replicas;
        rcu_assign_pointer(c->replicas, cpu_r);
        if (old_r)
                kfree_rcu(old_r, rcu);

        return 0;
}

static void bkey_to_replicas(struct bkey_s_c_extent e,
                             enum bch_data_type data_type,
                             struct bch_replicas_cpu_entry *r,
                             unsigned *max_dev)
{
        const struct bch_extent_ptr *ptr;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        memset(r, 0, sizeof(*r));
        r->data_type = data_type;

        *max_dev = 0;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached) {
                        *max_dev = max_t(unsigned, *max_dev, ptr->dev);
                        replicas_set_dev(r, ptr->dev);
                }
}

/*
 * for when gc of replica information is in progress:
 */
static int bch2_update_gc_replicas(struct bch_fs *c,
                                   struct bch_replicas_cpu *gc_r,
                                   struct bkey_s_c_extent e,
                                   enum bch_data_type data_type)
{
        struct bch_replicas_cpu_entry new_e;
        struct bch_replicas_cpu *new;
        unsigned i, nr, entry_size, max_dev;

        bkey_to_replicas(e, data_type, &new_e, &max_dev);

        entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
                DIV_ROUND_UP(max_dev + 1, 8);
        entry_size = max(entry_size, gc_r->entry_size);
        nr = gc_r->nr + 1;

        new = kzalloc(sizeof(struct bch_replicas_cpu) +
                      nr * entry_size, GFP_NOIO);
        if (!new)
                return -ENOMEM;

        new->nr         = nr;
        new->entry_size = entry_size;

        for (i = 0; i < gc_r->nr; i++)
                memcpy(cpu_replicas_entry(new, i),
                       cpu_replicas_entry(gc_r, i),
                       gc_r->entry_size);

        memcpy(cpu_replicas_entry(new, nr - 1),
               &new_e,
               new->entry_size);

        eytzinger0_sort(new->entries,
                        new->nr,
                        new->entry_size,
                        memcmp, NULL);

        rcu_assign_pointer(c->replicas_gc, new);
        kfree_rcu(gc_r, rcu);
        return 0;
}

static bool replicas_has_extent(struct bch_replicas_cpu *r,
                                struct bkey_s_c_extent e,
                                enum bch_data_type data_type)
{
        struct bch_replicas_cpu_entry search;
        unsigned max_dev;

        bkey_to_replicas(e, data_type, &search, &max_dev);

        return max_dev < replicas_dev_slots(r) &&
                eytzinger0_find(r->entries, r->nr,
                                r->entry_size,
                                memcmp, &search) < r->nr;
}

bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
                          enum bch_data_type data_type)
{
        bool ret;

        rcu_read_lock();
        ret = replicas_has_extent(rcu_dereference(c->replicas),
                                  e, data_type);
        rcu_read_unlock();

        return ret;
}

noinline
static int bch2_check_mark_super_slowpath(struct bch_fs *c,
                                          struct bkey_s_c_extent e,
                                          enum bch_data_type data_type)
{
        struct bch_replicas_cpu *gc_r;
        const struct bch_extent_ptr *ptr;
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *new_entry;
        unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
        int ret = 0;

        mutex_lock(&c->sb_lock);

        gc_r = rcu_dereference_protected(c->replicas_gc,
                                         lockdep_is_held(&c->sb_lock));
        if (gc_r &&
            !replicas_has_extent(gc_r, e, data_type)) {
                ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
                if (ret)
                        goto err;
        }

        /* recheck, might have raced */
        if (bch2_sb_has_replicas(c, e, data_type)) {
                mutex_unlock(&c->sb_lock);
                return 0;
        }

        new_entry_bytes = sizeof(struct bch_replicas_entry) +
                bch2_extent_nr_dirty_ptrs(e.s_c);

        sb_r = bch2_sb_get_replicas(c->disk_sb);

        bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

        new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));

        sb_r = bch2_fs_sb_resize_replicas(c,
                        DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
                                     sizeof(u64)));
        if (!sb_r) {
                ret = -ENOSPC;
                goto err;
        }

        new_entry = (void *) sb_r + bytes;
        new_entry->data_type = data_type;
        new_entry->nr = 0;

        extent_for_each_ptr(e, ptr)
                if (!ptr->cached)
                        new_entry->devs[new_entry->nr++] = ptr->dev;

        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret) {
                memset(new_entry, 0,
                       vstruct_end(&sb_r->field) - (void *) new_entry);
                goto err;
        }

        bch2_write_super(c);
err:
        mutex_unlock(&c->sb_lock);
        return ret;
}

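/*
 * Ensure the superblock's replicas section records the set of devices @e
 * is replicated on: the fast path is a lockless lookup in the cached
 * tables; on a miss we take sb_lock, append a new entry to the on disk
 * replicas section and write the superblock out:
 */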
int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
                          enum bch_data_type data_type)
{
        struct bch_replicas_cpu *gc_r;
        bool marked;

        rcu_read_lock();
        marked = replicas_has_extent(rcu_dereference(c->replicas),
                                     e, data_type) &&
                (!(gc_r = rcu_dereference(c->replicas_gc)) ||
                 replicas_has_extent(gc_r, e, data_type));
        rcu_read_unlock();

        if (marked)
                return 0;

        return bch2_check_mark_super_slowpath(c, e, data_type);
}

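/*
 * For each data type, compute the worst case replication given
 * @online_devs: the minimum number of online replicas and the maximum
 * number of offline replicas over all replicas entries of that type:
 */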
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                        struct bch_devs_mask online_devs)
{
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, dev, dev_slots, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].nr_online = UINT_MAX;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);
        dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);

        for (i = 0; i < r->nr; i++) {
                e = cpu_replicas_entry(r, i);

                BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));

                nr_online = nr_offline = 0;

                for (dev = 0; dev < dev_slots; dev++) {
                        if (!replicas_test_dev(e, dev))
                                continue;

                        if (test_bit(dev, online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].nr_online =
                        min(ret.replicas[e->data_type].nr_online,
                            nr_online);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        rcu_read_unlock();

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

bool bch2_have_enough_devs(struct bch_fs *c,
                           struct replicas_status s,
                           unsigned flags)
{
        if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
             s.replicas[BCH_DATA_BTREE].nr_offline) &&
            !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
                return false;

        if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
             !s.replicas[BCH_DATA_BTREE].nr_online) &&
            !(flags & BCH_FORCE_IF_METADATA_LOST))
                return false;

        if (s.replicas[BCH_DATA_USER].nr_offline &&
            !(flags & BCH_FORCE_IF_DATA_DEGRADED))
                return false;

        if (!s.replicas[BCH_DATA_USER].nr_online &&
            !(flags & BCH_FORCE_IF_DATA_LOST))
                return false;

        return true;
}

unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return meta
                ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
                      s.replicas[BCH_DATA_BTREE].nr_online)
                : s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, ret = 0;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);

        if (ca->dev_idx >= replicas_dev_slots(r))
                goto out;

        for (i = 0; i < r->nr; i++) {
                e = cpu_replicas_entry(r, i);

                if (replicas_test_dev(e, ca->dev_idx))
                        ret |= 1 << e->data_type;
        }
out:
        rcu_read_unlock();

        return ret;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
        struct bch_sb_field_members *mi;
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r = NULL;
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        mi      = bch2_sb_get_members(sb);
        sb_r    = bch2_sb_get_replicas(sb);
        if (!sb_r)
                return NULL;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: too many devices";
                if (e->nr >= BCH_REPLICAS_MAX)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                goto err;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_cpu_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_cpu_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                err = "duplicate replicas entry";
                if (!memcmp(l, r, cpu_r->entry_size))
                        goto err;
        }

        err = NULL;
err:
        kfree(cpu_r);
        return err;
}

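/*
 * Garbage collection of stale replicas entries: gc_start builds a second
 * table containing only the entries whose data types aren't being gc'd;
 * while gc runs, bch2_check_mark_super() re-adds entries that are still
 * referenced; gc_end makes the new table live and rewrites the on disk
 * replicas section from it:
 */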
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *r, *old_r;
        struct bch_replicas_entry *dst_e;
        size_t i, j, bytes, dev_slots;
        int ret = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);

        r = rcu_dereference_protected(c->replicas_gc,
                                      lockdep_is_held(&c->sb_lock));

        if (err) {
                rcu_assign_pointer(c->replicas_gc, NULL);
                kfree_rcu(r, rcu);
                goto err;
        }

        dev_slots = replicas_dev_slots(r);

        bytes = sizeof(struct bch_sb_field_replicas);

        for (i = 0; i < r->nr; i++) {
                struct bch_replicas_cpu_entry *e =
                        cpu_replicas_entry(r, i);

                bytes += sizeof(struct bch_replicas_entry);
                for (j = 0; j < r->entry_size - 1; j++)
                        bytes += hweight8(e->devs[j]);
        }

        sb_r = bch2_fs_sb_resize_replicas(c,
                        DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
        if (!sb_r) {
                ret = -ENOSPC;
                goto err;
        }

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst_e = sb_r->entries;
        for (i = 0; i < r->nr; i++) {
                struct bch_replicas_cpu_entry *src_e =
                        cpu_replicas_entry(r, i);

                dst_e->data_type = src_e->data_type;

                for (j = 0; j < dev_slots; j++)
                        if (replicas_test_dev(src_e, j))
                                dst_e->devs[dst_e->nr++] = j;

                dst_e = replicas_entry_next(dst_e);
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas, r);
        rcu_assign_pointer(c->replicas_gc, NULL);
        kfree_rcu(old_r, rcu);

        bch2_write_super(c);
err:
        mutex_unlock(&c->sb_lock);
        return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_cpu *r, *src;
        unsigned i;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc);

        src = rcu_dereference_protected(c->replicas,
                                        lockdep_is_held(&c->sb_lock));

        r = kzalloc(sizeof(struct bch_replicas_cpu) +
                    src->nr * src->entry_size, GFP_NOIO);
        if (!r) {
                mutex_unlock(&c->sb_lock);
                return -ENOMEM;
        }

        r->entry_size = src->entry_size;
        r->nr = 0;

        for (i = 0; i < src->nr; i++) {
                struct bch_replicas_cpu_entry *dst_e =
                        cpu_replicas_entry(r, r->nr);
                struct bch_replicas_cpu_entry *src_e =
                        cpu_replicas_entry(src, i);

                if (!((1 << src_e->data_type) & typemask)) {
                        memcpy(dst_e, src_e, r->entry_size);
                        r->nr++;
                }
        }

        eytzinger0_sort(r->entries,
                        r->nr,
                        r->entry_size,
                        memcmp, NULL);

        rcu_assign_pointer(c->replicas_gc, r);
        mutex_unlock(&c->sb_lock);

        return 0;
}