// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

static inline int u8_cmp(u8 l, u8 r)
{
        return cmp_int(l, r);
}

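/*
 * Replicas entries keep their device lists sorted, so that two entries can
 * be compared and looked up with a plain memcmp():
 */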
static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
        unsigned i;

        for (i = 0; i + 1 < e->nr_devs; i++)
                BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

static void replicas_entry_sort(struct bch_replicas_entry *e)
{
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

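/*
 * The in-memory replicas table is kept in eytzinger (BFS) order, which is
 * what eytzinger0_find() expects: e.g. sorted entries [A B C D E F G] are
 * stored as [D B F A C E G].
 */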
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

void bch2_replicas_entry_to_text(struct printbuf *out,
                                 struct bch_replicas_entry *e)
{
        unsigned i;

        pr_buf(out, "%s: %u/%u [",
               bch2_data_types[e->data_type],
               e->nr_required,
               e->nr_devs);

        for (i = 0; i < e->nr_devs; i++)
                pr_buf(out, i ? " %u" : "%u", e->devs[i]);
        pr_buf(out, "]");
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
                               struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_cpu_replicas_entry(r, e) {
                if (!first)
                        pr_buf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
}

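/*
 * Build a replicas entry from an extent or btree pointer: cached pointers
 * don't count towards durability and are skipped; a pointer into an erasure
 * coded stripe clears the entry, since those replicas are tracked via the
 * stripe key instead.
 */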
static void extent_to_replicas(struct bkey_s_c k,
                               struct bch_replicas_entry *r)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;

        r->nr_required  = 1;

        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                if (p.ptr.cached)
                        continue;

                if (p.ec_nr) {
                        r->nr_devs = 0;
                        break;
                }

                r->devs[r->nr_devs++] = p.ptr.dev;
        }
}

static void stripe_to_replicas(struct bkey_s_c k,
                               struct bch_replicas_entry *r)
{
        struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
        const struct bch_extent_ptr *ptr;

        r->nr_required  = s.v->nr_blocks - s.v->nr_redundant;

        for (ptr = s.v->ptrs;
             ptr < s.v->ptrs + s.v->nr_blocks;
             ptr++)
                r->devs[r->nr_devs++] = ptr->dev;
}

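/*
 * Fill out a replicas entry for any key type that points at data: btree
 * pointers count as metadata, extents and stripes as user data.
 */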
void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                           struct bkey_s_c k)
{
        e->nr_devs = 0;

        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
                e->data_type = BCH_DATA_BTREE;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_extent:
                e->data_type = BCH_DATA_USER;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_stripe:
                e->data_type = BCH_DATA_USER;
                stripe_to_replicas(k, e);
                break;
        }

        replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
                              enum bch_data_type data_type,
                              struct bch_devs_list devs)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        e->data_type    = data_type;
        e->nr_devs      = 0;
        e->nr_required  = 1;

        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];

        replicas_entry_sort(e);
}

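/*
 * Returns a copy of @old with @new_entry added, in sorted order; entries
 * are padded out to the larger of the two entry sizes. Allocation failure
 * is signalled by a NULL .entries in the returned table - the typical
 * pattern, as in bch2_mark_replicas_slowpath() below, is:
 *
 *      new = cpu_replicas_add_entry(&c->replicas, e);
 *      if (!new.entries)
 *              return -ENOMEM;
 */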
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                       struct bch_replicas_entry *new_entry)
{
        unsigned i;
        struct bch_replicas_cpu new = {
                .nr             = old->nr + 1,
                .entry_size     = max_t(unsigned, old->entry_size,
                                        replicas_entry_bytes(new_entry)),
        };

        BUG_ON(!new_entry->data_type);
        verify_replicas_entry_sorted(new_entry);

        new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
        if (!new.entries)
                return new;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(&new, i),
                       cpu_replicas_entry(old, i),
                       old->entry_size);

        memcpy(cpu_replicas_entry(&new, old->nr),
               new_entry,
               replicas_entry_bytes(new_entry));

        bch2_cpu_replicas_sort(&new);
        return new;
}

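/*
 * Eytzinger binary search for @search. Comparing with memcmp() at the
 * table's full entry_size is safe because kcalloc() zeroed the padding
 * beyond each entry when the table was built.
 */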
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
                                       struct bch_replicas_entry *search)
{
        int idx, entry_size = replicas_entry_bytes(search);

        if (unlikely(entry_size > r->entry_size))
                return -1;

        verify_replicas_entry_sorted(search);

#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
        idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
                              entry_cmp, search);
#undef entry_cmp

        return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
                            struct bch_replicas_entry *search)
{
        replicas_entry_sort(search);

        return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
                                 struct bch_replicas_entry *search)
{
        return __replicas_entry_idx(r, search) >= 0;
}

static bool bch2_replicas_marked_locked(struct bch_fs *c,
                          struct bch_replicas_entry *search,
                          bool check_gc_replicas)
{
        if (!search->nr_devs)
                return true;

        verify_replicas_entry_sorted(search);

        return __replicas_has_entry(&c->replicas, search) &&
                (!check_gc_replicas ||
                 likely(!c->replicas_gc.entries) ||
                 __replicas_has_entry(&c->replicas_gc, search));
}

bool bch2_replicas_marked(struct bch_fs *c,
                          struct bch_replicas_entry *search,
                          bool check_gc_replicas)
{
        bool marked;

        percpu_down_read(&c->mark_lock);
        marked = bch2_replicas_marked_locked(c, search, check_gc_replicas);
        percpu_up_read(&c->mark_lock);

        return marked;
}

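/*
 * Copy accounting from @src (indexed by @src_r) into @dst (indexed by
 * @dst_r): any entry with a nonzero counter must already exist in the new
 * table (BUG otherwise).
 */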
static void __replicas_table_update(struct bch_fs_usage *dst,
                                    struct bch_replicas_cpu *dst_r,
                                    struct bch_fs_usage *src,
                                    struct bch_replicas_cpu *src_r)
{
        int src_idx, dst_idx;

        *dst = *src;

        for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
                if (!src->replicas[src_idx])
                        continue;

                dst_idx = __replicas_entry_idx(dst_r,
                                cpu_replicas_entry(src_r, src_idx));
                BUG_ON(dst_idx < 0);

                dst->replicas[dst_idx] = src->replicas[src_idx];
        }
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
                                    struct bch_replicas_cpu *dst_r,
                                    struct bch_fs_usage __percpu *src_p,
                                    struct bch_replicas_cpu *src_r)
{
        unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
        struct bch_fs_usage *dst, *src = (void *)
                bch2_acc_percpu_u64s((void *) src_p, src_nr);

        preempt_disable();
        dst = this_cpu_ptr(dst_p);
        preempt_enable();

        __replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting: reallocate all the bch_fs_usage arrays to
 * index by the new replicas table, copy the existing counters across, then
 * swap everything in; normally called with mark_lock held for write.
 */
static int replicas_table_update(struct bch_fs *c,
                                 struct bch_replicas_cpu *new_r)
{
        struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
        struct bch_fs_usage *new_scratch = NULL;
        struct bch_fs_usage __percpu *new_gc = NULL;
        struct bch_fs_usage *new_base = NULL;
        unsigned bytes = sizeof(struct bch_fs_usage) +
                sizeof(u64) * new_r->nr;
        int ret = -ENOMEM;

        if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
            !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
                                                GFP_NOIO)) ||
            !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
                                                GFP_NOIO)) ||
            !(new_scratch  = kmalloc(bytes, GFP_NOIO)) ||
            (c->usage_gc &&
             !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
                goto err;

        if (c->usage_base)
                __replicas_table_update(new_base,               new_r,
                                        c->usage_base,          &c->replicas);
        if (c->usage[0])
                __replicas_table_update_pcpu(new_usage[0],      new_r,
                                             c->usage[0],       &c->replicas);
        if (c->usage[1])
                __replicas_table_update_pcpu(new_usage[1],      new_r,
                                             c->usage[1],       &c->replicas);
        if (c->usage_gc)
                __replicas_table_update_pcpu(new_gc,            new_r,
                                             c->usage_gc,       &c->replicas);

        swap(c->usage_base,     new_base);
        swap(c->usage[0],       new_usage[0]);
        swap(c->usage[1],       new_usage[1]);
        swap(c->usage_scratch,  new_scratch);
        swap(c->usage_gc,       new_gc);
        swap(c->replicas,       *new_r);
        ret = 0;
err:
        free_percpu(new_gc);
        kfree(new_scratch);
        free_percpu(new_usage[1]);
        free_percpu(new_usage[0]);
        kfree(new_base);
        return ret;
}

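/*
 * Size, in u64s, of the journal reservation needed to write out one usage
 * entry per replicas entry, plus the fixed jset_entry_usage entries for
 * nr_inodes, key_version and persistent_reserved.
 */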
static unsigned reserve_journal_replicas(struct bch_fs *c,
                                     struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        unsigned journal_res_u64s = 0;

        /* nr_inodes: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* key_version: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* persistent_reserved: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
                BCH_REPLICAS_MAX;

        for_each_cpu_replicas_entry(r, e)
                journal_res_u64s +=
                        DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
                                     e->nr_devs, sizeof(u64));
        return journal_res_u64s;
}

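/*
 * Slowpath for marking a new replicas entry: add it to the superblock (and
 * to the GC table, if a GC pass is in progress) and write the superblock
 * out before touching the in-memory tables, so the entry is persistent
 * before any data referencing it can be.
 */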
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                struct bch_replicas_entry *new_entry)
{
        struct bch_replicas_cpu new_r, new_gc;
        int ret = -ENOMEM;

        memset(&new_r, 0, sizeof(new_r));
        memset(&new_gc, 0, sizeof(new_gc));

        mutex_lock(&c->sb_lock);

        if (c->replicas_gc.entries &&
            !__replicas_has_entry(&c->replicas_gc, new_entry)) {
                new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
                if (!new_gc.entries)
                        goto err;
        }

        if (!__replicas_has_entry(&c->replicas, new_entry)) {
                new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
                if (!new_r.entries)
                        goto err;

                ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
                if (ret)
                        goto err;

                bch2_journal_entry_res_resize(&c->journal,
                                &c->replicas_journal_res,
                                reserve_journal_replicas(c, &new_r));
        }

        ret = 0;
        if (!new_r.entries &&
            !new_gc.entries)
                goto err;

        /* allocations done, now commit: */

        if (new_r.entries)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */
        percpu_down_write(&c->mark_lock);
        if (new_r.entries)
                ret = replicas_table_update(c, &new_r);
        if (new_gc.entries)
                swap(new_gc, c->replicas_gc);
        percpu_up_write(&c->mark_lock);
err:
        mutex_unlock(&c->sb_lock);

        kfree(new_r.entries);
        kfree(new_gc.entries);

        return ret;
}

int bch2_mark_replicas(struct bch_fs *c,
                       struct bch_replicas_entry *r)
{
        return likely(bch2_replicas_marked(c, r, true))
                ? 0
                : bch2_mark_replicas_slowpath(c, r);
}

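/*
 * A key's replicas are marked if each cached pointer has a single-device
 * entry of its own (via bch2_replicas_entry_cached()) and the dirty
 * pointers, taken together, have an entry.
 */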
bool bch2_bkey_replicas_marked_locked(struct bch_fs *c,
                                      struct bkey_s_c k,
                                      bool check_gc_replicas)
{
        struct bch_replicas_padded search;
        struct bch_devs_list cached = bch2_bkey_cached_devs(k);
        unsigned i;

        for (i = 0; i < cached.nr; i++) {
                bch2_replicas_entry_cached(&search.e, cached.devs[i]);

                if (!bch2_replicas_marked_locked(c, &search.e,
                                                 check_gc_replicas))
                        return false;
        }

        bch2_bkey_to_replicas(&search.e, k);

        return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas);
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
                               struct bkey_s_c k,
                               bool check_gc_replicas)
{
        bool marked;

        percpu_down_read(&c->mark_lock);
        marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas);
        percpu_up_read(&c->mark_lock);

        return marked;
}

int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
        struct bch_replicas_padded search;
        struct bch_devs_list cached = bch2_bkey_cached_devs(k);
        unsigned i;
        int ret;

        for (i = 0; i < cached.nr; i++) {
                bch2_replicas_entry_cached(&search.e, cached.devs[i]);

                ret = bch2_mark_replicas(c, &search.e);
                if (ret)
                        return ret;
        }

        bch2_bkey_to_replicas(&search.e, k);

        return bch2_mark_replicas(c, &search.e);
}

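/*
 * Finish a replicas GC pass: carry over any entries that still have a
 * nonzero sector count even though GC didn't re-mark them, then write the
 * pruned table to the superblock and swap it in.
 */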
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
        unsigned i;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        /*
         * this is kind of crappy; the replicas gc mechanism needs to be ripped
         * out
         */

        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct bch_replicas_cpu n;

                if (!__replicas_has_entry(&c->replicas_gc, e) &&
                    (c->usage_base->replicas[i] ||
                     percpu_u64_get(&c->usage[0]->replicas[i]) ||
                     percpu_u64_get(&c->usage[1]->replicas[i]))) {
                        n = cpu_replicas_add_entry(&c->replicas_gc, e);
                        if (!n.entries) {
                                ret = -ENOMEM;
                                goto err;
                        }

                        swap(n, c->replicas_gc);
                        kfree(n.entries);
                }
        }

        if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
                ret = -ENOSPC;
                goto err;
        }

        ret = replicas_table_update(c, &c->replicas_gc);
err:
        kfree(c->replicas_gc.entries);
        c->replicas_gc.entries = NULL;

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

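/*
 * Start a replicas GC pass: @typemask selects the data types being GC'd.
 * Entries whose type is *not* in the mask are copied into replicas_gc up
 * front and thus unconditionally retained; entries of the GC'd types only
 * survive if they get re-marked before bch2_replicas_gc_end().
 */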
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_entry *e;
        unsigned i = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc.entries);

        c->replicas_gc.nr               = 0;
        c->replicas_gc.entry_size       = 0;

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask)) {
                        c->replicas_gc.nr++;
                        c->replicas_gc.entry_size =
                                max_t(unsigned, c->replicas_gc.entry_size,
                                      replicas_entry_bytes(e));
                }

        c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
                                         c->replicas_gc.entry_size,
                                         GFP_NOIO);
        if (!c->replicas_gc.entries) {
                mutex_unlock(&c->sb_lock);
                return -ENOMEM;
        }

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
                               e, c->replicas_gc.entry_size);

        bch2_cpu_replicas_sort(&c->replicas_gc);
        mutex_unlock(&c->sb_lock);

        return 0;
}

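/*
 * Newer style of replicas GC: prune entries with a zero sector count in a
 * single pass (journal entries are kept unconditionally), retrying from the
 * top if the table was resized while the copy was being allocated without
 * locks held.
 */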
int bch2_replicas_gc2(struct bch_fs *c)
{
        struct bch_replicas_cpu new = { 0 };
        unsigned i, nr;
        int ret = 0;

        bch2_journal_meta(&c->journal);
retry:
        nr              = READ_ONCE(c->replicas.nr);
        new.entry_size  = READ_ONCE(c->replicas.entry_size);
        new.entries     = kcalloc(nr, new.entry_size, GFP_KERNEL);
        if (!new.entries)
                return -ENOMEM;

        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        if (nr                  != c->replicas.nr ||
            new.entry_size      != c->replicas.entry_size) {
                percpu_up_write(&c->mark_lock);
                mutex_unlock(&c->sb_lock);
                kfree(new.entries);
                goto retry;
        }

        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);

                if (e->data_type == BCH_DATA_JOURNAL ||
                    c->usage_base->replicas[i] ||
                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
                    percpu_u64_get(&c->usage[1]->replicas[i]))
                        memcpy(cpu_replicas_entry(&new, new.nr++),
                               e, new.entry_size);
        }

        bch2_cpu_replicas_sort(&new);

        if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
                ret = -ENOSPC;
                goto err;
        }

        ret = replicas_table_update(c, &new);
err:
        kfree(new.entries);

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
                            struct bch_replicas_entry *r,
                            u64 sectors)
{
        int ret, idx = bch2_replicas_entry_idx(c, r);

        if (idx < 0) {
                struct bch_replicas_cpu n;

                n = cpu_replicas_add_entry(&c->replicas, r);
                if (!n.entries)
                        return -ENOMEM;

                ret = replicas_table_update(c, &n);
                kfree(n.entries);
                if (ret)
                        return ret;

                idx = bch2_replicas_entry_idx(c, r);
                BUG_ON(idx < 0);
        }

        c->usage_base->replicas[idx] = sectors;

        return 0;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
                                   struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry *e, *dst;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
        if (!cpu_r->entries)
                return -ENOMEM;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        for_each_replicas_entry(sb_r, e) {
                dst = cpu_replicas_entry(cpu_r, idx++);
                memcpy(dst, e, replicas_entry_bytes(e));
                replicas_entry_sort(dst);
        }

        return 0;
}

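/*
 * v0 replicas entries predate the nr_required field; convert them assuming
 * nr_required == 1, sizing entry_size to leave room for the extra field.
 */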
static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
                                      struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry_v0 *e;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        entry_size += sizeof(struct bch_replicas_entry) -
                sizeof(struct bch_replicas_entry_v0);

        cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
        if (!cpu_r->entries)
                return -ENOMEM;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        for_each_replicas_entry(sb_r, e) {
                struct bch_replicas_entry *dst =
                        cpu_replicas_entry(cpu_r, idx++);

                dst->data_type  = e->data_type;
                dst->nr_devs    = e->nr_devs;
                dst->nr_required = 1;
                memcpy(dst->devs, e->devs, e->nr_devs);
                replicas_entry_sort(dst);
        }

        return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_v1;
        struct bch_sb_field_replicas_v0 *sb_v0;
        struct bch_replicas_cpu new_r = { 0, 0, NULL };
        int ret = 0;

        if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
                ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
        else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
                ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);

        if (ret)
                return -ENOMEM;

        bch2_cpu_replicas_sort(&new_r);

        percpu_down_write(&c->mark_lock);
        ret = replicas_table_update(c, &new_r);
        percpu_up_write(&c->mark_lock);

        kfree(new_r.entries);

        return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
                                               struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas_v0 *sb_r;
        struct bch_replicas_entry_v0 *dst;
        struct bch_replicas_entry *src;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src)
                bytes += replicas_entry_bytes(src) - 1;

        sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
        sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                dst->data_type  = src->data_type;
                dst->nr_devs    = src->nr_devs;
                memcpy(dst->devs, src->devs, src->nr_devs);

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

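/*
 * Write the in-memory table to the superblock, using the more compact v0
 * encoding when possible - i.e. when every entry has nr_required == 1.
 */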
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *dst, *src;
        bool need_v1 = false;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src) {
                bytes += replicas_entry_bytes(src);
                if (src->nr_required != 1)
                        need_v1 = true;
        }

        if (!need_v1)
                return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

        sb_r = bch2_sb_resize_replicas(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
        sb_r = bch2_sb_get_replicas(c->disk_sb.sb);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                memcpy(dst, src, replicas_entry_bytes(src));

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
{
        unsigned i;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                if (!memcmp(l, r, cpu_r->entry_size))
                        return "duplicate replicas entry";
        }

        return NULL;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu cpu_r = { .entries = NULL };
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr_devs)
                        goto err;

                err = "invalid replicas entry: bad nr_required";
                if (!e->nr_required ||
                    (e->nr_required > 1 &&
                     e->nr_required >= e->nr_devs))
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr_devs; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
                goto err;

        err = check_dup_replicas_entries(&cpu_r);
err:
        kfree(cpu_r.entries);
        return err;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
                                     struct bch_sb *sb,
                                     struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *r = field_to_type(f, replicas);
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_replicas_entry(r, e) {
                if (!first)
                        pr_buf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
        .validate       = bch2_sb_validate_replicas,
        .to_text        = bch2_sb_replicas_to_text,
};

static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
{
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu cpu_r = { .entries = NULL };
        struct bch_replicas_entry_v0 *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry_v0(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr_devs)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr_devs; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
                goto err;

        err = check_dup_replicas_entries(&cpu_r);
err:
        kfree(cpu_r.entries);
        return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
        .validate       = bch2_sb_validate_replicas_v0,
};

/* Query replicas: */

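/*
 * Compute, per data type, the worst-case redundancy (how many more devices
 * could go offline before data of that type becomes unavailable) and the
 * maximum number of offline devices holding any one entry's replicas.
 */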
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
{
        struct bch_sb_field_members *mi;
        struct bch_replicas_entry *e;
        unsigned i, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].redundancy = INT_MAX;

        mi = bch2_sb_get_members(c->disk_sb.sb);

        percpu_down_read(&c->mark_lock);

        for_each_cpu_replicas_entry(&c->replicas, e) {
                if (e->data_type >= ARRAY_SIZE(ret.replicas))
                        panic("e %p data_type %u\n", e, e->data_type);

                nr_online = nr_offline = 0;

                for (i = 0; i < e->nr_devs; i++) {
                        BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
                                                e->devs[i]));

                        if (test_bit(e->devs[i], online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].redundancy =
                        min(ret.replicas[e->data_type].redundancy,
                            (int) nr_online - (int) e->nr_required);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        percpu_up_read(&c->mark_lock);

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                if (ret.replicas[i].redundancy == INT_MAX)
                        ret.replicas[i].redundancy = 0;

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

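/*
 * "Degraded" means some replicas are offline but everything should still be
 * readable; "lost" means redundancy went negative, so some data may not be.
 * Proceeding in each state requires its own BCH_FORCE_IF_* flag.
 */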
static bool have_enough_devs(struct replicas_status s,
                             enum bch_data_type type,
                             bool force_if_degraded,
                             bool force_if_lost)
{
        return (!s.replicas[type].nr_offline || force_if_degraded) &&
                (s.replicas[type].redundancy >= 0 || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
        return (have_enough_devs(s, BCH_DATA_JOURNAL,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_BTREE,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_USER,
                                 flags & BCH_FORCE_IF_DATA_DEGRADED,
                                 flags & BCH_FORCE_IF_DATA_LOST));
}

int bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return (meta
                ? min(s.replicas[BCH_DATA_JOURNAL].redundancy,
                      s.replicas[BCH_DATA_BTREE].redundancy)
                : s.replicas[BCH_DATA_USER].redundancy) + 1;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_entry *e;
        unsigned i, ret = 0;

        percpu_down_read(&c->mark_lock);

        for_each_cpu_replicas_entry(&c->replicas, e)
                for (i = 0; i < e->nr_devs; i++)
                        if (e->devs[i] == ca->dev_idx)
                                ret |= 1 << e->data_type;

        percpu_up_read(&c->mark_lock);

        return ret;
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
        c->journal.entry_u64s_reserved +=
                reserve_journal_replicas(c, &c->replicas);

        return replicas_table_update(c, &c->replicas);
}