#include "bcachefs.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

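/*
 * A replicas entry records, for one data type, the set of devices a piece of
 * data is replicated across and how many of those devices are needed to read
 * it back (nr_required; greater than 1 only for erasure coded stripes).  The
 * in-memory table (struct bch_replicas_cpu) is a flat array of fixed-size
 * entries kept in eytzinger order, so lookups can use eytzinger0_find().
 */
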
static inline int u8_cmp(u8 l, u8 r)
{
	return (l > r) - (l < r);
}

static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	unsigned i;

	for (i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

static void replicas_entry_sort(struct bch_replicas_entry *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

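/*
 * Prints an entry as "<type>: <nr_required>/<nr_devs> [dev ...]"; e.g. a 2x
 * replicated user extent on devices 1 and 3 would come out as
 * "user: 1/2 [1 3]", assuming bch2_data_types[] names BCH_DATA_USER "user".
 */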
void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry *e)
{
	unsigned i;

	pr_buf(out, "%s: %u/%u [",
	       bch2_data_types[e->data_type],
	       e->nr_required,
	       e->nr_devs);

	for (i = 0; i < e->nr_devs; i++)
		pr_buf(out, i ? " %u" : "%u", e->devs[i]);
	pr_buf(out, "]");
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			pr_buf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

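/*
 * Building a replicas entry from a key: cached pointers are skipped (cached
 * copies aren't tracked here), and an extent with erasure coded pointers
 * yields an empty entry (nr_devs == 0), since its devices are accounted via
 * the stripe keys handled by stripe_to_replicas() instead.
 */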
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (p.ec_nr) {
			r->nr_devs = 0;
			break;
		}

		r->devs[r->nr_devs++] = p.ptr.dev;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required	= s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
}

static void bkey_to_replicas(struct bch_replicas_entry *e,
			     struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
		e->data_type = BCH_DATA_BTREE;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
		e->data_type = BCH_DATA_USER;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_USER;
		stripe_to_replicas(k, e);
		break;
	}

	replicas_entry_sort(e);
}

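/*
 * Worked example: data_type BCH_DATA_USER with a devs list of { 3, 1 } yields
 * { .data_type = BCH_DATA_USER, .nr_required = 1, .nr_devs = 2,
 *   .devs = { 1, 3 } }; the device list is sorted so that entries can be
 * compared bytewise.
 */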
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	unsigned i;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	for (i = 0; i < devs.nr; i++)
		e->devs[e->nr_devs++] = devs.devs[i];

	replicas_entry_sort(e);
}

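/*
 * Returns a new table with new_entry appended and re-sorted; the caller owns
 * (and must kfree) both the old and the new ->entries.  A result with
 * .entries == NULL means the allocation failed.
 */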
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
		       struct bch_replicas_entry *new_entry)
{
	unsigned i;
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	BUG_ON(!new_entry->data_type);
	verify_replicas_entry_sorted(new_entry);

	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
	if (!new.entries)
		return new;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

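/*
 * Lookup is a binary search over the eytzinger-ordered table, comparing raw
 * entry bytes, which is why device lists must be sorted before searching.
 * Returns -1 if the entry isn't present (or is wider than the table's
 * entry_size, in which case it can't be present).
 */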
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

	verify_replicas_entry_sorted(search);

#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry *search)
{
	replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry *search,
			  bool check_gc_replicas)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry_sorted(search);

	percpu_down_read_preempt_disable(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(!check_gc_replicas ||
		 likely(!c->replicas_gc.entries) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read_preempt_enable(&c->mark_lock);

	return marked;
}

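/*
 * struct bch_fs_usage ends in a flexible array of sector counts, one per
 * replicas entry, indexed by the entry's position in c->replicas.  When the
 * replicas table is resized, the percpu usage counters have to be copied
 * across and reindexed against the new table; that's what
 * __replicas_table_update() does.
 */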
static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage __percpu *src_p,
				    struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((void *) src_p, src_nr);
	int src_idx, dst_idx;

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}

/*
 * Resize filesystem accounting:
 */
static int replicas_table_update(struct bch_fs *c,
				 struct bch_replicas_cpu *new_r)
{
	struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
	struct bch_fs_usage __percpu *new_scratch = NULL;
	unsigned bytes = sizeof(struct bch_fs_usage) +
		sizeof(u64) * new_r->nr;
	int ret = -ENOMEM;

	if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
						GFP_NOIO)) ||
	    (c->usage[1] &&
	     !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
						 GFP_NOIO))) ||
	    !(new_scratch  = __alloc_percpu_gfp(bytes, sizeof(u64),
						GFP_NOIO)))
		goto err;

	if (c->usage[0])
		__replicas_table_update(new_usage[0],	new_r,
					c->usage[0],	&c->replicas);
	if (c->usage[1])
		__replicas_table_update(new_usage[1],	new_r,
					c->usage[1],	&c->replicas);

	swap(c->usage[0],	new_usage[0]);
	swap(c->usage[1],	new_usage[1]);
	swap(c->usage_scratch,	new_scratch);
	swap(c->replicas,	*new_r);
	ret = 0;
err:
	free_percpu(new_scratch);
	free_percpu(new_usage[1]);
	free_percpu(new_usage[0]);
	return ret;
}

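/*
 * Journal entries carry filesystem usage information (nr_inodes, key_version,
 * persistent_reserved, and per-replicas-entry sector counts), so space for
 * one jset_entry_usage / jset_entry_data_usage per item has to be reserved in
 * every journal write.  This computes that reservation, in u64s, for a given
 * replicas table.
 */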
static unsigned reserve_journal_replicas(struct bch_fs *c,
					 struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}

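/*
 * Slowpath for marking an entry that isn't in the table(s) yet: the entry is
 * added to the superblock, and the superblock is written out *before* the
 * in-memory tables are updated, so that nothing ever references a replicas
 * entry that isn't persistent.
 */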
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = -ENOMEM;

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
		if (!new_gc.entries)
			goto err;
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
		if (!new_r.entries)
			goto err;

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	else
		ret = 0;
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);

	goto err;
out:
	ret = 0;
err:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
}

int bch2_mark_replicas(struct bch_fs *c,
		       struct bch_replicas_entry *r)
{
	return likely(bch2_replicas_marked(c, r, true))
		? 0
		: bch2_mark_replicas_slowpath(c, r);
}

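/*
 * The bkey variants also track cached pointers, as separate single-device
 * "cached" entries (via bch2_replicas_entry_cached()), in addition to the
 * entry for the key's dirty pointers.
 */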
bool bch2_bkey_replicas_marked(struct bch_fs *c,
			       struct bkey_s_c k,
			       bool check_gc_replicas)
{
	struct bch_replicas_padded search;
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;

	for (i = 0; i < cached.nr; i++) {
		bch2_replicas_entry_cached(&search.e, cached.devs[i]);

		if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
			return false;
	}

	bkey_to_replicas(&search.e, k);

	return bch2_replicas_marked(c, &search.e, check_gc_replicas);
}

int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_replicas_padded search;
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;
	int ret;

	for (i = 0; i < cached.nr; i++) {
		bch2_replicas_entry_cached(&search.e, cached.devs[i]);

		ret = bch2_mark_replicas(c, &search.e);
		if (ret)
			return ret;
	}

	bkey_to_replicas(&search.e, k);

	return bch2_mark_replicas(c, &search.e);
}

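/*
 * Replicas GC: bch2_replicas_gc_start() seeds c->replicas_gc with the entries
 * whose data type is *not* in typemask; while GC runs,
 * bch2_mark_replicas_slowpath() adds any entry that's still referenced.
 * bch2_replicas_gc_end() then also keeps entries that still have a nonzero
 * sector count, writes the result to the superblock and installs it as the
 * new c->replicas.  Callers hold c->replicas_gc_lock across the start/end
 * pair.
 */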
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	unsigned i;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);

	if (ret)
		goto err;

	/*
	 * this is kind of crappy; the replicas gc mechanism needs to be ripped
	 * out
	 */

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);
		struct bch_replicas_cpu n;
		u64 v;

		if (__replicas_has_entry(&c->replicas_gc, e))
			continue;

		v = percpu_u64_get(&c->usage[0]->replicas[i]);
		if (!v)
			continue;

		n = cpu_replicas_add_entry(&c->replicas_gc, e);
		if (!n.entries) {
			ret = -ENOSPC;
			goto err;
		}

		percpu_down_write(&c->mark_lock);
		swap(n, c->replicas_gc);
		percpu_up_write(&c->mark_lock);

		kfree(n.entries);
	}

	if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
		ret = -ENOSPC;
		goto err;
	}

	bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
err:
	percpu_down_write(&c->mark_lock);
	if (!ret)
		ret = replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;
	percpu_up_write(&c->mark_lock);

	mutex_unlock(&c->sb_lock);
	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_NOIO);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		return -ENOMEM;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_replicas_set_usage(struct bch_fs *c,
			    struct bch_replicas_entry *r,
			    u64 sectors)
{
	int ret, idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0) {
		struct bch_replicas_cpu n;

		n = cpu_replicas_add_entry(&c->replicas, r);
		if (!n.entries)
			return -ENOMEM;

		ret = replicas_table_update(c, &n);
		kfree(n.entries);
		if (ret)
			return ret;

		idx = bch2_replicas_entry_idx(c, r);
		BUG_ON(idx < 0);
	}

	percpu_u64_set(&c->usage[0]->replicas[idx], sectors);

	return 0;
}

/* Replicas tracking - superblock: */

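/*
 * Two superblock encodings exist: the old format (bch_replicas_entry_v0) has
 * no nr_required field, so nr_required is taken to be 1 when converting it to
 * the in-memory format; the current format stores entries verbatim.  When
 * writing, the v0 field is used unless some entry has nr_required != 1 (see
 * bch2_cpu_replicas_to_sb_replicas()).
 */
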
static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
	if (!cpu_r->entries)
		return -ENOMEM;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		replicas_entry_sort(dst);
	}

	return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
	if (!cpu_r->entries)
		return -ENOMEM;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);

	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);

	ret = replicas_table_update(c, &new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -ENOSPC;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -ENOSPC;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_get_replicas(c->disk_sb.sb);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
{
	unsigned i;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i + 1 < cpu_r->nr; i++) {
		struct bch_replicas_entry *l =
			cpu_replicas_entry(cpu_r, i);
		struct bch_replicas_entry *r =
			cpu_replicas_entry(cpu_r, i + 1);

		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

		if (!memcmp(l, r, cpu_r->entry_size))
			return "duplicate replicas entry";
	}

	return NULL;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
	struct bch_replicas_cpu cpu_r = { .entries = NULL };
	struct bch_replicas_entry *e;
	const char *err;
	unsigned i;

	for_each_replicas_entry(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: no devices";
		if (!e->nr_devs)
			goto err;

		err = "invalid replicas entry: bad nr_required";
		if (!e->nr_required ||
		    (e->nr_required > 1 &&
		     e->nr_required >= e->nr_devs))
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr_devs; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
		goto err;

	err = check_dup_replicas_entries(&cpu_r);
err:
	kfree(cpu_r.entries);
	return err;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			pr_buf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_validate_replicas,
	.to_text	= bch2_sb_replicas_to_text,
};

static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
	struct bch_replicas_cpu cpu_r = { .entries = NULL };
	struct bch_replicas_entry_v0 *e;
	const char *err;
	unsigned i;

	for_each_replicas_entry_v0(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: no devices";
		if (!e->nr_devs)
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr_devs; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
		goto err;

	err = check_dup_replicas_entries(&cpu_r);
err:
	kfree(cpu_r.entries);
	return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_validate_replicas_v0,
};

/* Query replicas: */

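/*
 * replicas_status summarizes, per data type, how degraded the filesystem is
 * for a given set of online devices: redundancy is the minimum over all
 * entries of (online devices in the entry - nr_required), i.e. how many more
 * devices could go offline before some data becomes unreadable, and
 * nr_offline is the largest number of offline devices in any one entry.
 */
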
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
					      struct bch_devs_mask online_devs)
{
	struct bch_sb_field_members *mi;
	struct bch_replicas_entry *e;
	unsigned i, nr_online, nr_offline;
	struct replicas_status ret;

	memset(&ret, 0, sizeof(ret));

	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		ret.replicas[i].redundancy = INT_MAX;

	mi = bch2_sb_get_members(c->disk_sb.sb);

	percpu_down_read_preempt_disable(&c->mark_lock);

	for_each_cpu_replicas_entry(&c->replicas, e) {
		if (e->data_type >= ARRAY_SIZE(ret.replicas))
			panic("e %p data_type %u\n", e, e->data_type);

		nr_online = nr_offline = 0;

		for (i = 0; i < e->nr_devs; i++) {
			BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
						e->devs[i]));

			if (test_bit(e->devs[i], online_devs.d))
				nr_online++;
			else
				nr_offline++;
		}

		ret.replicas[e->data_type].redundancy =
			min(ret.replicas[e->data_type].redundancy,
			    (int) nr_online - (int) e->nr_required);

		ret.replicas[e->data_type].nr_offline =
			max(ret.replicas[e->data_type].nr_offline,
			    nr_offline);
	}

	percpu_up_read_preempt_enable(&c->mark_lock);

	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		if (ret.replicas[i].redundancy == INT_MAX)
			ret.replicas[i].redundancy = 0;

	return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
	return __bch2_replicas_status(c, bch2_online_devs(c));
}

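/*
 * Degraded operation: the BCH_FORCE_IF_*_DEGRADED flags allow running with
 * some replicas offline (nr_offline != 0); the BCH_FORCE_IF_*_LOST flags
 * allow running even when redundancy has gone negative, i.e. when some data
 * of that type may be unreadable.
 */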
static bool have_enough_devs(struct replicas_status s,
			     enum bch_data_type type,
			     bool force_if_degraded,
			     bool force_if_lost)
{
	return (!s.replicas[type].nr_offline || force_if_degraded) &&
		(s.replicas[type].redundancy >= 0 || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
	return (have_enough_devs(s, BCH_DATA_JOURNAL,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_BTREE,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_USER,
				 flags & BCH_FORCE_IF_DATA_DEGRADED,
				 flags & BCH_FORCE_IF_DATA_LOST));
}

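/*
 * Returns the worst-case number of readable copies (redundancy + 1) for
 * metadata (journal and btree) or user data.
 */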
int bch2_replicas_online(struct bch_fs *c, bool meta)
{
	struct replicas_status s = bch2_replicas_status(c);

	return (meta
		? min(s.replicas[BCH_DATA_JOURNAL].redundancy,
		      s.replicas[BCH_DATA_BTREE].redundancy)
		: s.replicas[BCH_DATA_USER].redundancy) + 1;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_replicas_entry *e;
	unsigned i, ret = 0;

	percpu_down_read_preempt_disable(&c->mark_lock);

	for_each_cpu_replicas_entry(&c->replicas, e)
		for (i = 0; i < e->nr_devs; i++)
			if (e->devs[i] == ca->dev_idx)
				ret |= 1 << e->data_type;

	percpu_up_read_preempt_enable(&c->mark_lock);

	return ret;
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
	c->journal.entry_u64s_reserved +=
		reserve_journal_replicas(c, &c->replicas);

	return replicas_table_update(c, &c->replicas);
}