// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

static void verify_replicas_entry(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
        unsigned i;

        BUG_ON(e->data_type >= BCH_DATA_NR);
        BUG_ON(!e->nr_devs);
        BUG_ON(e->nr_required > 1 &&
               e->nr_required >= e->nr_devs);

        for (i = 0; i + 1 < e->nr_devs; i++)
                BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

static void replicas_entry_sort(struct bch_replicas_entry *e)
{
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

void bch2_replicas_entry_to_text(struct printbuf *out,
                                 struct bch_replicas_entry *e)
{
        pr_buf(out, "%s: %u/%u [",
               bch2_data_types[e->data_type],
               e->nr_required,
               e->nr_devs);

        unsigned i;
        for (i = 0; i < e->nr_devs; i++)
                pr_buf(out, i ? " %u" : "%u", e->devs[i]);
        pr_buf(out, "]");
}
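
/*
 * Informal example: a 2x replicated user-data entry on devices 0 and 3,
 * requiring at least one copy to be intact, prints as "user: 1/2 [0 3]".
 */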

void bch2_cpu_replicas_to_text(struct printbuf *out,
                               struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_cpu_replicas_entry(r, e) {
                if (!first)
                        pr_buf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
}

static void extent_to_replicas(struct bkey_s_c k,
                               struct bch_replicas_entry *r)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;

        r->nr_required  = 1;

        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                if (p.ptr.cached)
                        continue;

                if (!p.has_ec)
                        r->devs[r->nr_devs++] = p.ptr.dev;
                else
                        r->nr_required = 0;
        }
}

static void stripe_to_replicas(struct bkey_s_c k,
                               struct bch_replicas_entry *r)
{
        struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
        const struct bch_extent_ptr *ptr;

        r->nr_required  = s.v->nr_blocks - s.v->nr_redundant;

        for (ptr = s.v->ptrs;
             ptr < s.v->ptrs + s.v->nr_blocks;
             ptr++)
                r->devs[r->nr_devs++] = ptr->dev;
}
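
/*
 * Informal example: for a 4+2 erasure coded stripe (nr_blocks == 6,
 * nr_redundant == 2) the entry covers all six devices with nr_required == 4,
 * since any four surviving blocks suffice to reconstruct the data.
 */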

void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                           struct bkey_s_c k)
{
        e->nr_devs = 0;

        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
        case KEY_TYPE_btree_ptr_v2:
                e->data_type = BCH_DATA_btree;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_extent:
        case KEY_TYPE_reflink_v:
                e->data_type = BCH_DATA_user;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_stripe:
                e->data_type = BCH_DATA_parity;
                stripe_to_replicas(k, e);
                break;
        }

        replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
                              enum bch_data_type data_type,
                              struct bch_devs_list devs)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_sb ||
               data_type >= BCH_DATA_NR);

        e->data_type    = data_type;
        e->nr_devs      = 0;
        e->nr_required  = 1;

        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];

        replicas_entry_sort(e);
}

static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                       struct bch_replicas_entry *new_entry)
{
        unsigned i;
        struct bch_replicas_cpu new = {
                .nr             = old->nr + 1,
                .entry_size     = max_t(unsigned, old->entry_size,
                                        replicas_entry_bytes(new_entry)),
        };

        BUG_ON(!new_entry->data_type);
        verify_replicas_entry(new_entry);

        new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
        if (!new.entries)
                return new;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(&new, i),
                       cpu_replicas_entry(old, i),
                       old->entry_size);

        memcpy(cpu_replicas_entry(&new, old->nr),
               new_entry,
               replicas_entry_bytes(new_entry));

        bch2_cpu_replicas_sort(&new);
        return new;
}

static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
                                       struct bch_replicas_entry *search)
{
        int idx, entry_size = replicas_entry_bytes(search);

        if (unlikely(entry_size > r->entry_size))
                return -1;

        verify_replicas_entry(search);

#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size)
        idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
                              entry_cmp, search);
#undef entry_cmp

        return idx < r->nr ? idx : -1;
}
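
/*
 * Note: entry_cmp compares only replicas_entry_bytes(search) bytes. This is
 * safe because nr_devs lives inside the compared prefix, so entries of
 * different lengths can never compare equal, and kcalloc() zero pads each
 * slot out to entry_size, keeping the eytzinger sort order well defined.
 */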

int bch2_replicas_entry_idx(struct bch_fs *c,
                            struct bch_replicas_entry *search)
{
        replicas_entry_sort(search);

        return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
                                 struct bch_replicas_entry *search)
{
        return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
                          struct bch_replicas_entry *search)
{
        bool marked;

        if (!search->nr_devs)
                return true;

        verify_replicas_entry(search);

        percpu_down_read(&c->mark_lock);
        marked = __replicas_has_entry(&c->replicas, search) &&
                (likely(!c->replicas_gc.entries) ||
                 __replicas_has_entry(&c->replicas_gc, search));
        percpu_up_read(&c->mark_lock);

        return marked;
}

static void __replicas_table_update(struct bch_fs_usage *dst,
                                    struct bch_replicas_cpu *dst_r,
                                    struct bch_fs_usage *src,
                                    struct bch_replicas_cpu *src_r)
{
        int src_idx, dst_idx;

        *dst = *src;

        for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
                if (!src->replicas[src_idx])
                        continue;

                dst_idx = __replicas_entry_idx(dst_r,
                                cpu_replicas_entry(src_r, src_idx));
                BUG_ON(dst_idx < 0);

                dst->replicas[dst_idx] = src->replicas[src_idx];
        }
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
                                    struct bch_replicas_cpu *dst_r,
                                    struct bch_fs_usage __percpu *src_p,
                                    struct bch_replicas_cpu *src_r)
{
        unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
        struct bch_fs_usage *dst, *src = (void *)
                bch2_acc_percpu_u64s((void *) src_p, src_nr);

        preempt_disable();
        dst = this_cpu_ptr(dst_p);
        preempt_enable();

        __replicas_table_update(dst, dst_r, src, src_r);
}
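
/*
 * The percpu variant first folds all per-cpu copies of the source counters
 * into a single array (bch2_acc_percpu_u64s()), then remaps that into this
 * cpu's copy of the destination; the other cpus' destination copies are
 * still zeroed from allocation, and readers sum over all cpus, so the
 * totals are preserved.
 */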

/*
 * Resize filesystem accounting:
 */
static int replicas_table_update(struct bch_fs *c,
                                 struct bch_replicas_cpu *new_r)
{
        struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
        struct bch_fs_usage *new_scratch = NULL;
        struct bch_fs_usage __percpu *new_gc = NULL;
        struct bch_fs_usage *new_base = NULL;
        unsigned i, bytes = sizeof(struct bch_fs_usage) +
                sizeof(u64) * new_r->nr;
        int ret = 0;

        memset(new_usage, 0, sizeof(new_usage));

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
                                        sizeof(u64), GFP_NOIO)))
                        goto err;

        if (!(new_base = kzalloc(bytes, GFP_NOIO)) ||
            !(new_scratch  = kmalloc(bytes, GFP_NOIO)) ||
            (c->usage_gc &&
             !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
                goto err;

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                if (c->usage[i])
                        __replicas_table_update_pcpu(new_usage[i], new_r,
                                                     c->usage[i], &c->replicas);
        if (c->usage_base)
                __replicas_table_update(new_base,               new_r,
                                        c->usage_base,          &c->replicas);
        if (c->usage_gc)
                __replicas_table_update_pcpu(new_gc,            new_r,
                                             c->usage_gc,       &c->replicas);

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                swap(c->usage[i],       new_usage[i]);
        swap(c->usage_base,     new_base);
        swap(c->usage_scratch,  new_scratch);
        swap(c->usage_gc,       new_gc);
        swap(c->replicas,       *new_r);
out:
        free_percpu(new_gc);
        kfree(new_scratch);
        /* free all JOURNAL_BUF_NR buffers, not just the first two: */
        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                free_percpu(new_usage[i]);
        kfree(new_base);
        return ret;
err:
        bch_err(c, "error updating replicas table: memory allocation failure");
        ret = -ENOMEM;
        goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
                                     struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        unsigned journal_res_u64s = 0;

        /* nr_inodes: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* key_version: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* persistent_reserved: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
                BCH_REPLICAS_MAX;

        for_each_cpu_replicas_entry(r, e)
                journal_res_u64s +=
                        DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
                                     e->nr_devs, sizeof(u64));
        return journal_res_u64s;
}
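
/*
 * Worked example (sizes are illustrative, not guaranteed): if
 * sizeof(struct jset_entry_usage) is 16 bytes (2 u64s) and BCH_REPLICAS_MAX
 * is 4, the fixed overhead is 2 + 2 + 2 * 4 = 12 u64s; on top of that, each
 * replicas entry reserves the u64-rounded size of a jset_entry_data_usage
 * with that entry's device list appended.
 */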

noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                struct bch_replicas_entry *new_entry)
{
        struct bch_replicas_cpu new_r, new_gc;
        int ret = 0;

        verify_replicas_entry(new_entry);

        memset(&new_r, 0, sizeof(new_r));
        memset(&new_gc, 0, sizeof(new_gc));

        mutex_lock(&c->sb_lock);

        if (c->replicas_gc.entries &&
            !__replicas_has_entry(&c->replicas_gc, new_entry)) {
                new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
                if (!new_gc.entries)
                        goto err;
        }

        if (!__replicas_has_entry(&c->replicas, new_entry)) {
                new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
                if (!new_r.entries)
                        goto err;

                ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
                if (ret)
                        goto err;

                bch2_journal_entry_res_resize(&c->journal,
                                &c->replicas_journal_res,
                                reserve_journal_replicas(c, &new_r));
        }

        if (!new_r.entries &&
            !new_gc.entries)
                goto out;

        /* allocations done, now commit: */

        if (new_r.entries)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */
        percpu_down_write(&c->mark_lock);
        if (new_r.entries)
                ret = replicas_table_update(c, &new_r);
        if (new_gc.entries)
                swap(new_gc, c->replicas_gc);
        percpu_up_write(&c->mark_lock);
out:
        mutex_unlock(&c->sb_lock);

        kfree(new_r.entries);
        kfree(new_gc.entries);

        return ret;
err:
        bch_err(c, "error adding replicas entry: memory allocation failure");
        ret = -ENOMEM;
        goto out;
}

static int __bch2_mark_replicas(struct bch_fs *c,
                                struct bch_replicas_entry *r,
                                bool check)
{
        return likely(bch2_replicas_marked(c, r))       ? 0
                : check                                 ? -1
                : bch2_mark_replicas_slowpath(c, r);
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
        return __bch2_mark_replicas(c, r, false);
}
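
/*
 * Usage sketch (hypothetical caller): before a write creates a new device
 * combination, the matching entry must be marked, so the superblock lists
 * it before any data referencing it exists:
 *
 *      struct bch_replicas_padded r;
 *
 *      bch2_devlist_to_replicas(&r.e, BCH_DATA_user, devs);
 *      ret = bch2_mark_replicas(c, &r.e);
 *      if (ret)
 *              return ret;     // superblock update failed, or -ENOMEM
 */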

static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
                                     bool check)
{
        struct bch_replicas_padded search;
        struct bch_devs_list cached = bch2_bkey_cached_devs(k);
        unsigned i;
        int ret;

        for (i = 0; i < cached.nr; i++) {
                bch2_replicas_entry_cached(&search.e, cached.devs[i]);

                ret = __bch2_mark_replicas(c, &search.e, check);
                if (ret)
                        return ret;
        }

        bch2_bkey_to_replicas(&search.e, k);

        ret = __bch2_mark_replicas(c, &search.e, check);
        if (ret)
                return ret;

        if (search.e.data_type == BCH_DATA_parity) {
                search.e.data_type = BCH_DATA_cached;
                ret = __bch2_mark_replicas(c, &search.e, check);
                if (ret)
                        return ret;

                search.e.data_type = BCH_DATA_user;
                ret = __bch2_mark_replicas(c, &search.e, check);
                if (ret)
                        return ret;
        }

        return 0;
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
                               struct bkey_s_c k)
{
        return __bch2_mark_bkey_replicas(c, k, true) == 0;
}

int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
{
        return __bch2_mark_bkey_replicas(c, k, false);
}

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
        unsigned i;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        /*
         * this is kind of crappy; the replicas gc mechanism needs to be ripped
         * out
         */

        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct bch_replicas_cpu n;

                if (!__replicas_has_entry(&c->replicas_gc, e) &&
                    bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
                        n = cpu_replicas_add_entry(&c->replicas_gc, e);
                        if (!n.entries) {
                                ret = -ENOSPC;
                                goto err;
                        }

                        swap(n, c->replicas_gc);
                        kfree(n.entries);
                }
        }

        if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
                ret = -ENOSPC;
                goto err;
        }

        ret = replicas_table_update(c, &c->replicas_gc);
err:
        kfree(c->replicas_gc.entries);
        c->replicas_gc.entries = NULL;

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_entry *e;
        unsigned i = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc.entries);

        c->replicas_gc.nr               = 0;
        c->replicas_gc.entry_size       = 0;

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask)) {
                        c->replicas_gc.nr++;
                        c->replicas_gc.entry_size =
                                max_t(unsigned, c->replicas_gc.entry_size,
                                      replicas_entry_bytes(e));
                }

        c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
                                         c->replicas_gc.entry_size,
                                         GFP_NOIO);
        if (!c->replicas_gc.entries) {
                mutex_unlock(&c->sb_lock);
                bch_err(c, "error allocating c->replicas_gc");
                return -ENOMEM;
        }

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
                               e, c->replicas_gc.entry_size);

        bch2_cpu_replicas_sort(&c->replicas_gc);
        mutex_unlock(&c->sb_lock);

        return 0;
}
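
/*
 * Usage sketch (hypothetical caller, with c->replicas_gc_lock held as the
 * lockdep assertions require): gc_start() seeds replicas_gc with every entry
 * whose type is *not* in typemask; marking during gc re-adds live entries of
 * the masked types, and gc_end() drops whatever was never re-marked:
 *
 *      ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_user);
 *      // ... walk keys, bch2_mark_replicas() everything still referenced ...
 *      ret = bch2_replicas_gc_end(c, ret);
 */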

int bch2_replicas_gc2(struct bch_fs *c)
{
        struct bch_replicas_cpu new = { 0 };
        unsigned i, nr;
        int ret = 0;

        bch2_journal_meta(&c->journal);
retry:
        nr              = READ_ONCE(c->replicas.nr);
        new.entry_size  = READ_ONCE(c->replicas.entry_size);
        new.entries     = kcalloc(nr, new.entry_size, GFP_KERNEL);
        if (!new.entries) {
                bch_err(c, "error allocating c->replicas_gc");
                return -ENOMEM;
        }

        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        if (nr                  != c->replicas.nr ||
            new.entry_size      != c->replicas.entry_size) {
                percpu_up_write(&c->mark_lock);
                mutex_unlock(&c->sb_lock);
                kfree(new.entries);
                goto retry;
        }

        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);

                if (e->data_type == BCH_DATA_journal ||
                    c->usage_base->replicas[i] ||
                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
                    percpu_u64_get(&c->usage[1]->replicas[i]) ||
                    percpu_u64_get(&c->usage[2]->replicas[i]) ||
                    percpu_u64_get(&c->usage[3]->replicas[i]))
                        memcpy(cpu_replicas_entry(&new, new.nr++),
                               e, new.entry_size);
        }

        bch2_cpu_replicas_sort(&new);

        if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
                ret = -ENOSPC;
                goto err;
        }

        ret = replicas_table_update(c, &new);
err:
        kfree(new.entries);

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
                            struct bch_replicas_entry *r,
                            u64 sectors)
{
        int ret, idx = bch2_replicas_entry_idx(c, r);

        if (idx < 0) {
                struct bch_replicas_cpu n;

                n = cpu_replicas_add_entry(&c->replicas, r);
                if (!n.entries)
                        return -ENOMEM;

                ret = replicas_table_update(c, &n);
                kfree(n.entries);
                if (ret)
                        return ret;

                idx = bch2_replicas_entry_idx(c, r);
                BUG_ON(idx < 0);
        }

        c->usage_base->replicas[idx] = sectors;

        return 0;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
                                   struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry *e, *dst;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
        if (!cpu_r->entries)
                return -ENOMEM;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        for_each_replicas_entry(sb_r, e) {
                dst = cpu_replicas_entry(cpu_r, idx++);
                memcpy(dst, e, replicas_entry_bytes(e));
                replicas_entry_sort(dst);
        }

        return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
                                      struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry_v0 *e;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        entry_size += sizeof(struct bch_replicas_entry) -
                sizeof(struct bch_replicas_entry_v0);

        cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO);
        if (!cpu_r->entries)
                return -ENOMEM;

        cpu_r->nr               = nr;
        cpu_r->entry_size       = entry_size;

        for_each_replicas_entry(sb_r, e) {
                struct bch_replicas_entry *dst =
                        cpu_replicas_entry(cpu_r, idx++);

                dst->data_type  = e->data_type;
                dst->nr_devs    = e->nr_devs;
                dst->nr_required = 1;
                memcpy(dst->devs, e->devs, e->nr_devs);
                replicas_entry_sort(dst);
        }

        return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_v1;
        struct bch_sb_field_replicas_v0 *sb_v0;
        struct bch_replicas_cpu new_r = { 0, 0, NULL };
        int ret = 0;

        if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
                ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
        else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
                ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);

        if (ret)
                return -ENOMEM;

        bch2_cpu_replicas_sort(&new_r);

        percpu_down_write(&c->mark_lock);

        ret = replicas_table_update(c, &new_r);
        percpu_up_write(&c->mark_lock);

        kfree(new_r.entries);

        return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
                                               struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas_v0 *sb_r;
        struct bch_replicas_entry_v0 *dst;
        struct bch_replicas_entry *src;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src)
                bytes += replicas_entry_bytes(src) - 1;

        sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
        sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                dst->data_type  = src->data_type;
                dst->nr_devs    = src->nr_devs;
                memcpy(dst->devs, src->devs, src->nr_devs);

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *dst, *src;
        bool need_v1 = false;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src) {
                bytes += replicas_entry_bytes(src);
                if (src->nr_required != 1)
                        need_v1 = true;
        }

        if (!need_v1)
                return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

        sb_r = bch2_sb_resize_replicas(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
        sb_r = bch2_sb_get_replicas(c->disk_sb.sb);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                memcpy(dst, src, replicas_entry_bytes(src));

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}
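
/*
 * The v0 on-disk format has no nr_required field (hence the "- 1" in the v0
 * size calculation) and so can only represent entries with nr_required == 1;
 * as long as that holds for every entry we keep writing the older format,
 * presumably so the superblock stays readable by older versions.
 */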

static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
{
        unsigned i;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                if (!memcmp(l, r, cpu_r->entry_size))
                        return "duplicate replicas entry";
        }

        return NULL;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu cpu_r = { .entries = NULL };
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr_devs)
                        goto err;

                err = "invalid replicas entry: bad nr_required";
                if (e->nr_required > 1 &&
                    e->nr_required >= e->nr_devs)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr_devs; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
                goto err;

        err = check_dup_replicas_entries(&cpu_r);
err:
        kfree(cpu_r.entries);
        return err;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
                                     struct bch_sb *sb,
                                     struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *r = field_to_type(f, replicas);
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_replicas_entry(r, e) {
                if (!first)
                        pr_buf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
        .validate       = bch2_sb_validate_replicas,
        .to_text        = bch2_sb_replicas_to_text,
};

static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
{
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu cpu_r = { .entries = NULL };
        struct bch_replicas_entry_v0 *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry_v0(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr_devs)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr_devs; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
                goto err;

        err = check_dup_replicas_entries(&cpu_r);
err:
        kfree(cpu_r.entries);
        return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
        .validate       = bch2_sb_validate_replicas_v0,
};

/* Query replicas: */

struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
{
        struct bch_sb_field_members *mi;
        struct bch_replicas_entry *e;
        unsigned i, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].redundancy = INT_MAX;

        mi = bch2_sb_get_members(c->disk_sb.sb);

        percpu_down_read(&c->mark_lock);

        for_each_cpu_replicas_entry(&c->replicas, e) {
                if (e->data_type >= ARRAY_SIZE(ret.replicas))
                        panic("e %p data_type %u\n", e, e->data_type);

                nr_online = nr_offline = 0;

                for (i = 0; i < e->nr_devs; i++) {
                        BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
                                                e->devs[i]));

                        if (test_bit(e->devs[i], online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].redundancy =
                        min(ret.replicas[e->data_type].redundancy,
                            (int) nr_online - (int) e->nr_required);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        percpu_up_read(&c->mark_lock);

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                if (ret.replicas[i].redundancy == INT_MAX)
                        ret.replicas[i].redundancy = 0;

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

static bool have_enough_devs(struct replicas_status s,
                             enum bch_data_type type,
                             bool force_if_degraded,
                             bool force_if_lost)
{
        return (!s.replicas[type].nr_offline || force_if_degraded) &&
                (s.replicas[type].redundancy >= 0 || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
        return (have_enough_devs(s, BCH_DATA_journal,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_btree,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_user,
                                 flags & BCH_FORCE_IF_DATA_DEGRADED,
                                 flags & BCH_FORCE_IF_DATA_LOST));
}
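
/*
 * Informal example: take a 1/2 user-data entry on devices 0 and 3. With
 * device 3 offline, nr_online == 1 and nr_required == 1, so redundancy is 0
 * (still readable), but nr_offline == 1, so bch2_have_enough_devs() refuses
 * unless BCH_FORCE_IF_DATA_DEGRADED is set. With both devices offline,
 * redundancy drops to -1 and BCH_FORCE_IF_DATA_LOST is needed as well.
 */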

int bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return (meta
                ? min(s.replicas[BCH_DATA_journal].redundancy,
                      s.replicas[BCH_DATA_btree].redundancy)
                : s.replicas[BCH_DATA_user].redundancy) + 1;
}
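
/*
 * redundancy is (online copies - nr_required), so the +1 turns it into an
 * effective replication count: e.g. a 1/2 user-data entry with both devices
 * online has redundancy 1, and this returns 2.
 */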

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_entry *e;
        unsigned i, ret = 0;

        percpu_down_read(&c->mark_lock);

        for_each_cpu_replicas_entry(&c->replicas, e)
                for (i = 0; i < e->nr_devs; i++)
                        if (e->devs[i] == ca->dev_idx)
                                ret |= 1 << e->data_type;

        percpu_up_read(&c->mark_lock);

        return ret;
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
        c->journal.entry_u64s_reserved +=
                reserve_journal_replicas(c, &c->replicas);

        return replicas_table_update(c, &c->replicas);
}