#include "bcachefs.h"
#include "replicas.h"
#include "super-io.h"

/*
 * Stack-allocated search key: bch_replicas_entry is variable length, so pad
 * it out to hold the largest possible device list.
 */
struct bch_replicas_entry_padded {
        struct bch_replicas_entry e;
        u8                        pad[BCH_SB_MEMBERS_MAX];
};

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

static inline int u8_cmp(u8 l, u8 r)
{
        return (l > r) - (l < r);
}

/* Device lists are kept sorted so whole entries can be compared with memcmp: */
static void replicas_entry_sort(struct bch_replicas_entry *e)
{
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

#define for_each_cpu_replicas_entry(_r, _i)                             \
        for (_i = (_r)->entries;                                        \
             (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
             _i = (void *) (_i) + (_r)->entry_size)

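/*
 * In-memory replicas entries are variable length on disk but stored here in
 * a flat array at a fixed stride of entry_size bytes (the size of the
 * largest entry, zero padded). A minimal iteration sketch:
 *
 *      struct bch_replicas_entry *e;
 *
 *      for_each_cpu_replicas_entry(r, e)
 *              pr_info("type %u, %u devs\n", e->data_type, e->nr_devs);
 */
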
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
        return (void *) r->entries + r->entry_size * i;
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

static int replicas_entry_to_text(struct bch_replicas_entry *e,
                                  char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        unsigned i;

        out += scnprintf(out, end - out, "%u: [", e->data_type);

        for (i = 0; i < e->nr_devs; i++)
                out += scnprintf(out, end - out,
                                 i ? " %u" : "%u", e->devs[i]);
        out += scnprintf(out, end - out, "]");

        return out - buf;
}

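/*
 * The resulting format is "<data type>: [<dev> <dev> ...]"; e.g. a two-way
 * replicated entry on devices 0 and 2 might render as "1: [0 2]" (the exact
 * data type number depends on enum bch_data_type).
 */
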
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
                              char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_cpu_replicas_entry(r, e) {
                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;
                out += replicas_entry_to_text(e, out, end - out);
        }

        return out - buf;
}

static void extent_to_replicas(struct bkey_s_c k,
                               struct bch_replicas_entry *r)
{
        if (bkey_extent_is_data(k.k)) {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;

                /* cached pointers are tracked separately, as BCH_DATA_CACHED: */
                extent_for_each_ptr_decode(e, p, entry)
                        if (!p.ptr.cached)
                                r->devs[r->nr_devs++] = p.ptr.dev;
        }
}

static void bkey_to_replicas(enum bkey_type type,
                             struct bkey_s_c k,
                             struct bch_replicas_entry *e)
{
        e->nr_devs = 0;

        switch (type) {
        case BKEY_TYPE_BTREE:
                e->data_type = BCH_DATA_BTREE;
                extent_to_replicas(k, e);
                break;
        case BKEY_TYPE_EXTENTS:
                e->data_type = BCH_DATA_USER;
                extent_to_replicas(k, e);
                break;
        default:
                break;
        }

        replicas_entry_sort(e);
}

static inline void devlist_to_replicas(struct bch_devs_list devs,
                                       enum bch_data_type data_type,
                                       struct bch_replicas_entry *e)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_SB ||
               data_type >= BCH_DATA_NR);

        e->data_type = data_type;
        e->nr_devs = 0;

        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];

        replicas_entry_sort(e);
}

static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
                       struct bch_replicas_entry *new_entry)
{
        struct bch_replicas_cpu *new;
        unsigned i, nr, entry_size;

        entry_size = max_t(unsigned, old->entry_size,
                           replicas_entry_bytes(new_entry));
        nr = old->nr + 1;

        new = kzalloc(sizeof(struct bch_replicas_cpu) +
                      nr * entry_size, GFP_NOIO);
        if (!new)
                return NULL;

        new->nr         = nr;
        new->entry_size = entry_size;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(new, i),
                       cpu_replicas_entry(old, i),
                       old->entry_size);

        memcpy(cpu_replicas_entry(new, old->nr),
               new_entry,
               replicas_entry_bytes(new_entry));

        bch2_cpu_replicas_sort(new);
        return new;
}

static bool replicas_has_entry(struct bch_replicas_cpu *r,
                               struct bch_replicas_entry *search)
{
        return replicas_entry_bytes(search) <= r->entry_size &&
                eytzinger0_find(r->entries, r->nr,
                                r->entry_size,
                                memcmp, search) < r->nr;
}

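/*
 * Note: eytzinger0_find() returns an index >= r->nr when no exact match
 * exists, hence the "< r->nr" test above; the entry_size check rejects
 * search keys wider than any stored entry, which could never compare equal.
 */
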
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                       struct bch_replicas_entry *new_entry)
{
        struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
        int ret = -ENOMEM;

        mutex_lock(&c->sb_lock);

        old_gc = rcu_dereference_protected(c->replicas_gc,
                                           lockdep_is_held(&c->sb_lock));
        if (old_gc && !replicas_has_entry(old_gc, new_entry)) {
                new_gc = cpu_replicas_add_entry(old_gc, new_entry);
                if (!new_gc)
                        goto err;
        }

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));
        if (!replicas_has_entry(old_r, new_entry)) {
                new_r = cpu_replicas_add_entry(old_r, new_entry);
                if (!new_r)
                        goto err;

                ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
                if (ret)
                        goto err;
        }

        /* allocations done, now commit: */

        if (new_r)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */

        if (new_gc) {
                rcu_assign_pointer(c->replicas_gc, new_gc);
                kfree_rcu(old_gc, rcu);
        }

        if (new_r) {
                rcu_assign_pointer(c->replicas, new_r);
                kfree_rcu(old_r, rcu);
        }

        mutex_unlock(&c->sb_lock);
        return 0;
err:
        mutex_unlock(&c->sb_lock);
        kfree(new_gc);
        kfree(new_r);
        return ret;
}

/*
 * Fast path: a pure RCU lookup; fall back to the slowpath (which takes
 * sb_lock and may write the superblock) only when the entry is missing.
 */
static int __bch2_mark_replicas(struct bch_fs *c,
                                struct bch_replicas_entry *devs)
{
        struct bch_replicas_cpu *r, *gc_r;
        bool marked;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);
        gc_r = rcu_dereference(c->replicas_gc);
        marked = replicas_has_entry(r, devs) &&
                (!likely(gc_r) || replicas_has_entry(gc_r, devs));
        rcu_read_unlock();

        return likely(marked) ? 0
                : bch2_mark_replicas_slowpath(c, devs);
}

int bch2_mark_replicas(struct bch_fs *c,
                       enum bch_data_type data_type,
                       struct bch_devs_list devs)
{
        struct bch_replicas_entry_padded search;

        if (!devs.nr)
                return 0;

        memset(&search, 0, sizeof(search));

        BUG_ON(devs.nr >= BCH_REPLICAS_MAX);

        devlist_to_replicas(devs, data_type, &search.e);

        return __bch2_mark_replicas(c, &search.e);
}

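/*
 * Example usage (a sketch; the device indices are hypothetical): record that
 * user data will be replicated across devices 0 and 1 before the write is
 * allowed to proceed:
 *
 *      struct bch_devs_list devs = { .nr = 2, .devs = { 0, 1 } };
 *      int ret = bch2_mark_replicas(c, BCH_DATA_USER, devs);
 */
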
int bch2_mark_bkey_replicas(struct bch_fs *c,
                            enum bkey_type type,
                            struct bkey_s_c k)
{
        struct bch_replicas_entry_padded search;
        int ret;

        memset(&search, 0, sizeof(search));

        if (type == BKEY_TYPE_EXTENTS) {
                struct bch_devs_list cached = bch2_bkey_cached_devs(k);
                unsigned i;

                for (i = 0; i < cached.nr; i++)
                        if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
                                        bch2_dev_list_single(cached.devs[i]))))
                                return ret;
        }

        bkey_to_replicas(type, k, &search.e);

        return search.e.nr_devs
                ? __bch2_mark_replicas(c, &search.e)
                : 0;
}

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
        struct bch_replicas_cpu *new_r, *old_r;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);

        new_r = rcu_dereference_protected(c->replicas_gc,
                                          lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas_gc, NULL);

        if (ret)
                goto err;

        if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
                ret = -ENOSPC;
                goto err;
        }

        bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */

        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));

        rcu_assign_pointer(c->replicas, new_r);
        kfree_rcu(old_r, rcu);
out:
        mutex_unlock(&c->sb_lock);
        return ret;
err:
        kfree_rcu(new_r, rcu);
        goto out;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_cpu *dst, *src;
        struct bch_replicas_entry *e;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc);

        src = rcu_dereference_protected(c->replicas,
                                        lockdep_is_held(&c->sb_lock));

        dst = kzalloc(sizeof(struct bch_replicas_cpu) +
                      src->nr * src->entry_size, GFP_NOIO);
        if (!dst) {
                mutex_unlock(&c->sb_lock);
                return -ENOMEM;
        }

        dst->nr         = 0;
        dst->entry_size = src->entry_size;

        /*
         * Seed the gc set with every entry whose type is *not* being gc'd;
         * entries of the gc'd types are re-added as they're found in use.
         */
        for_each_cpu_replicas_entry(src, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(dst, dst->nr++),
                               e, src->entry_size);

        bch2_cpu_replicas_sort(dst);

        rcu_assign_pointer(c->replicas_gc, dst);
        mutex_unlock(&c->sb_lock);

        return 0;
}

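/*
 * Typical gc pairing (a sketch; the walk function is hypothetical): seed the
 * gc set, re-mark everything still live, then atomically swap it in:
 *
 *      bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
 *      ret = walk_keys_and_mark_replicas(c);
 *      ret = bch2_replicas_gc_end(c, ret);
 */
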
/* Replicas tracking - superblock: */

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
        struct bch_replicas_entry *e, *dst;
        struct bch_replicas_cpu *cpu_r;
        unsigned nr = 0, entry_size = 0;

        if (sb_r)
                for_each_replicas_entry(sb_r, e) {
                        entry_size = max_t(unsigned, entry_size,
                                           replicas_entry_bytes(e));
                        nr++;
                }

        cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
                        nr * entry_size, GFP_NOIO);
        if (!cpu_r)
                return NULL;

        cpu_r->nr         = nr;
        cpu_r->entry_size = entry_size;

        nr = 0;

        if (sb_r)
                for_each_replicas_entry(sb_r, e) {
                        dst = cpu_replicas_entry(cpu_r, nr++);
                        memcpy(dst, e, replicas_entry_bytes(e));
                        replicas_entry_sort(dst);
                }

        bch2_cpu_replicas_sort(cpu_r);
        return cpu_r;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r, *old_r;

        sb_r  = bch2_sb_get_replicas(c->disk_sb.sb);
        cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                return -ENOMEM;

        old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas, cpu_r);
        if (old_r)
                kfree_rcu(old_r, rcu);

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *dst, *src;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src)
                bytes += replicas_entry_bytes(src);

        sb_r = bch2_sb_resize_replicas(&c->disk_sb,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -ENOSPC;

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                memcpy(dst, src, replicas_entry_bytes(src));

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

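/*
 * Sizing sketch (assuming struct bch_replicas_entry is two u8s plus the
 * device list): three entries with 2, 2 and 3 devices need the field header
 * plus 4 + 4 + 5 bytes, and bytes is rounded up to whole u64s because
 * superblock fields are sized in u64 units.
 */
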
static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
{
        unsigned i;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i + 1 < cpu_r->nr; i++) {
                struct bch_replicas_entry *l =
                        cpu_replicas_entry(cpu_r, i);
                struct bch_replicas_entry *r =
                        cpu_replicas_entry(cpu_r, i + 1);

                BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

                if (!memcmp(l, r, cpu_r->entry_size))
                        return "duplicate replicas entry";
        }

        return NULL;
}

static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        struct bch_replicas_cpu *cpu_r = NULL;
        struct bch_replicas_entry *e;
        const char *err;
        unsigned i;

        for_each_replicas_entry(sb_r, e) {
                err = "invalid replicas entry: invalid data type";
                if (e->data_type >= BCH_DATA_NR)
                        goto err;

                err = "invalid replicas entry: no devices";
                if (!e->nr_devs)
                        goto err;

                err = "invalid replicas entry: too many devices";
                if (e->nr_devs >= BCH_REPLICAS_MAX)
                        goto err;

                err = "invalid replicas entry: invalid device";
                for (i = 0; i < e->nr_devs; i++)
                        if (!bch2_dev_exists(sb, mi, e->devs[i]))
                                goto err;
        }

        err = "cannot allocate memory";
        cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                goto err;

        err = check_dup_replicas_entries(cpu_r);
err:
        kfree(cpu_r);
        return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
        .validate = bch2_sb_validate_replicas,
};

int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
        char *out = buf, *end = out + size;
        struct bch_replicas_entry *e;
        bool first = true;

        if (!r) {
                out += scnprintf(out, end - out, "(no replicas section found)");
                return out - buf;
        }

        for_each_replicas_entry(r, e) {
                if (!first)
                        out += scnprintf(out, end - out, " ");
                first = false;
                out += replicas_entry_to_text(e, out, end - out);
        }

        return out - buf;
}

/* Query replicas: */

bool bch2_replicas_marked(struct bch_fs *c,
                          enum bch_data_type data_type,
                          struct bch_devs_list devs)
{
        struct bch_replicas_entry_padded search;
        bool ret;

        if (!devs.nr)
                return true;

        memset(&search, 0, sizeof(search));

        devlist_to_replicas(devs, data_type, &search.e);

        rcu_read_lock();
        ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e);
        rcu_read_unlock();

        return ret;
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
                               enum bkey_type type,
                               struct bkey_s_c k)
{
        struct bch_replicas_entry_padded search;
        bool ret;

        memset(&search, 0, sizeof(search));

        if (type == BKEY_TYPE_EXTENTS) {
                struct bch_devs_list cached = bch2_bkey_cached_devs(k);
                unsigned i;

                for (i = 0; i < cached.nr; i++)
                        if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
                                        bch2_dev_list_single(cached.devs[i])))
                                return false;
        }

        bkey_to_replicas(type, k, &search.e);

        if (!search.e.nr_devs)
                return true;

        rcu_read_lock();
        ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e);
        rcu_read_unlock();

        return ret;
}

struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
{
        struct bch_sb_field_members *mi;
        struct bch_replicas_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, nr_online, nr_offline;
        struct replicas_status ret;

        memset(&ret, 0, sizeof(ret));

        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].nr_online = UINT_MAX;

        mi = bch2_sb_get_members(c->disk_sb.sb);
        rcu_read_lock();

        r = rcu_dereference(c->replicas);

        for_each_cpu_replicas_entry(r, e) {
                if (e->data_type >= ARRAY_SIZE(ret.replicas))
                        panic("e %p data_type %u\n", e, e->data_type);

                nr_online = nr_offline = 0;

                for (i = 0; i < e->nr_devs; i++) {
                        BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
                                                e->devs[i]));

                        if (test_bit(e->devs[i], online_devs.d))
                                nr_online++;
                        else
                                nr_offline++;
                }

                ret.replicas[e->data_type].nr_online =
                        min(ret.replicas[e->data_type].nr_online,
                            nr_online);

                ret.replicas[e->data_type].nr_offline =
                        max(ret.replicas[e->data_type].nr_offline,
                            nr_offline);
        }

        rcu_read_unlock();

        return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
        return __bch2_replicas_status(c, bch2_online_devs(c));
}

static bool have_enough_devs(struct replicas_status s,
                             enum bch_data_type type,
                             bool force_if_degraded,
                             bool force_if_lost)
{
        return (!s.replicas[type].nr_offline || force_if_degraded) &&
                (s.replicas[type].nr_online || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
        return (have_enough_devs(s, BCH_DATA_JOURNAL,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_BTREE,
                                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
                                 flags & BCH_FORCE_IF_METADATA_LOST) &&
                have_enough_devs(s, BCH_DATA_USER,
                                 flags & BCH_FORCE_IF_DATA_DEGRADED,
                                 flags & BCH_FORCE_IF_DATA_LOST));
}

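/*
 * Example (a sketch): refuse a degraded mount unless the user passed the
 * corresponding force flag:
 *
 *      struct replicas_status s = bch2_replicas_status(c);
 *
 *      if (!bch2_have_enough_devs(s, BCH_FORCE_IF_DATA_DEGRADED))
 *              return -EINVAL;
 */
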
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
        struct replicas_status s = bch2_replicas_status(c);

        return meta
                ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
                      s.replicas[BCH_DATA_BTREE].nr_online)
                : s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        struct bch_replicas_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, ret = 0;

        rcu_read_lock();
        r = rcu_dereference(c->replicas);

        for_each_cpu_replicas_entry(r, e)
                for (i = 0; i < e->nr_devs; i++)
                        if (e->devs[i] == ca->dev_idx)
                                ret |= 1 << e->data_type;

        rcu_read_unlock();

        return ret;
}
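
/*
 * The return value is a bitmask of data types present on the device, e.g.
 * (a sketch):
 *
 *      if (bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))
 *              pr_info("device %u still has user data\n", ca->dev_idx);
 */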