#include "bcachefs.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

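/*
 * The in-memory replicas table is one flat allocation: struct
 * bch_replicas_cpu holds the entry count and a single fixed entry size,
 * followed by the entries themselves.  Each entry is a data type plus a
 * bitmap of the devices holding replicas of that data, so entry_size is
 * determined by the highest device index the table has to represent.
 * Entries are kept in eytzinger order so lookups can use eytzinger0_find().
 */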
#define for_each_cpu_replicas_entry(_r, _i)				\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
	     _i = (void *) (_i) + (_r)->entry_size)

static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
	return (void *) r->entries + r->entry_size * i;
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

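/*
 * Device bitmaps are addressed with the usual byte/bit split: byte
 * dev >> 3, bit dev & 7.  replicas_dev_slots() is the number of device
 * indices that an entry's devs[] array can represent at the current
 * entry_size.
 */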
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
				     unsigned dev)
{
	return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
				    unsigned dev)
{
	e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
	return (r->entry_size -
		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}

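/*
 * Formats the table as "<data_type>: [dev dev ...]" per entry, space
 * separated - e.g. "1: [0 2] 2: [1 3]" (illustrative output, with device
 * indices decoded from the bitmaps).
 */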
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
			      char *buf, size_t size)
{
	char *out = buf, *end = out + size;
	struct bch_replicas_cpu_entry *e;
	bool first = true;
	unsigned i;

	for_each_cpu_replicas_entry(r, e) {
		bool first_e = true;

		if (!first)
			out += scnprintf(out, end - out, " ");
		first = false;

		out += scnprintf(out, end - out, "%u: [", e->data_type);

		for (i = 0; i < replicas_dev_slots(r); i++)
			if (replicas_test_dev(e, i)) {
				if (!first_e)
					out += scnprintf(out, end - out, " ");
				first_e = false;
				out += scnprintf(out, end - out, "%u", i);
			}
		out += scnprintf(out, end - out, "]");
	}

	return out - buf;
}

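/*
 * The helpers below convert an extent or a device list into a search key: a
 * zeroed bch_replicas_cpu_entry with the data type and device bitmap filled
 * in.  They also report the highest device index seen, which callers need
 * both to size new entries and to bounds-check lookups.
 */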
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
					enum bch_data_type data_type,
					struct bch_replicas_cpu_entry *r,
					unsigned *max_dev)
{
	const struct bch_extent_ptr *ptr;
	unsigned nr = 0;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	memset(r, 0, sizeof(*r));
	r->data_type = data_type;

	*max_dev = 0;

	extent_for_each_ptr(e, ptr)
		if (!ptr->cached) {
			/* only dirty pointers count towards replication: */
			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
			replicas_set_dev(r, ptr->dev);
			nr++;
		}

	return nr;
}

static inline void devlist_to_replicas(struct bch_devs_list devs,
				       enum bch_data_type data_type,
				       struct bch_replicas_cpu_entry *r,
				       unsigned *max_dev)
{
	unsigned i;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	memset(r, 0, sizeof(*r));
	r->data_type = data_type;

	*max_dev = 0;

	for (i = 0; i < devs.nr; i++) {
		*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
		replicas_set_dev(r, devs.devs[i]);
	}
}

static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
		       struct bch_replicas_cpu_entry new_entry,
		       unsigned max_dev)
{
	struct bch_replicas_cpu *new;
	unsigned i, nr, entry_size;

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);
	entry_size = max(entry_size, old->entry_size);
	nr = old->nr + 1;

	new = kzalloc(sizeof(struct bch_replicas_cpu) +
		      nr * entry_size, GFP_NOIO);
	if (!new)
		return NULL;

	new->nr		= nr;
	new->entry_size	= entry_size;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(new, i),
		       cpu_replicas_entry(old, i),
		       min(new->entry_size, old->entry_size));

	memcpy(cpu_replicas_entry(new, old->nr),
	       &new_entry,
	       new->entry_size);

	bch2_cpu_replicas_sort(new);
	return new;
}

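/*
 * Lookup in the eytzinger-ordered table.  An entry whose max_dev doesn't fit
 * within r->entry_size can't possibly be present, so the search is skipped
 * entirely in that case.
 */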
static bool replicas_has_entry(struct bch_replicas_cpu *r,
			       struct bch_replicas_cpu_entry search,
			       unsigned max_dev)
{
	return max_dev < replicas_dev_slots(r) &&
		eytzinger0_find(r->entries, r->nr,
				r->entry_size,
				memcmp, &search) < r->nr;
}

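/*
 * Slowpath: under sb_lock, build enlarged copies of whichever tables are
 * missing the new entry, persist the new table to the superblock first, and
 * only then publish the in-memory copies with rcu_assign_pointer() - readers
 * under rcu_read_lock() always see either the old or the new table, never an
 * intermediate state.
 */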
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				       struct bch_replicas_cpu_entry new_entry,
				       unsigned max_dev)
{
	struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
	int ret = -ENOMEM;

	mutex_lock(&c->sb_lock);

	old_gc = rcu_dereference_protected(c->replicas_gc,
					   lockdep_is_held(&c->sb_lock));
	if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
		new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
		if (!new_gc)
			goto err;
	}

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	if (!replicas_has_entry(old_r, new_entry, max_dev)) {
		new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
		if (!new_r)
			goto err;

		ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
		if (ret)
			goto err;
	}

	/* allocations done, now commit: */

	if (new_r)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */

	if (new_gc) {
		rcu_assign_pointer(c->replicas_gc, new_gc);
		kfree_rcu(old_gc, rcu);
	}

	if (new_r) {
		rcu_assign_pointer(c->replicas, new_r);
		kfree_rcu(old_r, rcu);
	}

	mutex_unlock(&c->sb_lock);
	return 0;
err:
	mutex_unlock(&c->sb_lock);
	kfree(new_gc);
	kfree(new_r);
	return ret;
}

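/*
 * Fastpath: a lockless RCU lookup; the slowpath (and sb_lock) is only taken
 * when the entry is genuinely missing, which in steady state is rare.
 */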
int bch2_mark_replicas(struct bch_fs *c,
		       enum bch_data_type data_type,
		       struct bch_devs_list devs)
{
	struct bch_replicas_cpu_entry search;
	struct bch_replicas_cpu *r, *gc_r;
	unsigned max_dev;
	bool marked;

	if (!devs.nr)
		return 0;

	BUG_ON(devs.nr >= BCH_REPLICAS_MAX);

	devlist_to_replicas(devs, data_type, &search, &max_dev);

	rcu_read_lock();
	r = rcu_dereference(c->replicas);
	gc_r = rcu_dereference(c->replicas_gc);
	marked = replicas_has_entry(r, search, max_dev) &&
		(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
	rcu_read_unlock();

	return likely(marked) ? 0
		: bch2_mark_replicas_slowpath(c, search, max_dev);
}

int bch2_mark_bkey_replicas(struct bch_fs *c,
			    enum bch_data_type data_type,
			    struct bkey_s_c k)
{
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;
	int ret;

	for (i = 0; i < cached.nr; i++)
		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
					      bch2_dev_list_single(cached.devs[i]))))
			return ret;

	return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}

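/*
 * Replicas GC: bch2_replicas_gc_start() installs a second table,
 * c->replicas_gc, seeded with only the entries whose data types are *not*
 * being walked.  While it exists, the mark path adds new entries to both
 * tables, so entries still referenced during the walk are re-added;
 * bch2_replicas_gc_end() then promotes the gc table to be the live table,
 * dropping whatever was stale.
 */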
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
	struct bch_replicas_cpu *new_r, *old_r;
	int ret = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);

	new_r = rcu_dereference_protected(c->replicas_gc,
					  lockdep_is_held(&c->sb_lock));

	if (err) {
		rcu_assign_pointer(c->replicas_gc, NULL);
		kfree_rcu(new_r, rcu);
		goto err;
	}

	if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
		ret = -ENOSPC;
		goto err;
	}

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));

	rcu_assign_pointer(c->replicas, new_r);
	rcu_assign_pointer(c->replicas_gc, NULL);
	kfree_rcu(old_r, rcu);

	bch2_write_super(c);
err:
	mutex_unlock(&c->sb_lock);
	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_cpu *dst, *src;
	struct bch_replicas_cpu_entry *e;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc);

	src = rcu_dereference_protected(c->replicas,
					lockdep_is_held(&c->sb_lock));

	dst = kzalloc(sizeof(struct bch_replicas_cpu) +
		      src->nr * src->entry_size, GFP_NOIO);
	if (!dst) {
		mutex_unlock(&c->sb_lock);
		return -ENOMEM;
	}

	dst->nr		= 0;
	dst->entry_size	= src->entry_size;

	/* keep only the entries whose data types are not being walked: */
	for_each_cpu_replicas_entry(src, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(dst, dst->nr++),
			       e, dst->entry_size);

	bch2_cpu_replicas_sort(dst);

	rcu_assign_pointer(c->replicas_gc, dst);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/* Replicas tracking - superblock: */

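/*
 * The superblock encoding differs from the in-memory one: each
 * bch_replicas_entry is a data type, a device count and a variable-length
 * list of device indices rather than a bitmap, so converting in either
 * direction re-encodes the device sets.
 */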
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
					unsigned *nr,
					unsigned *bytes,
					unsigned *max_dev)
{
	struct bch_replicas_entry *i;
	unsigned j;

	*nr	= 0;
	*bytes	= sizeof(*r);
	*max_dev = 0;

	if (!r)
		return;

	for_each_replicas_entry(r, i) {
		for (j = 0; j < i->nr; j++)
			*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
		(*nr)++;
	}

	*bytes = (void *) i - (void *) r;
}

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
	struct bch_replicas_cpu *cpu_r;
	unsigned i, nr, bytes, max_dev, entry_size;

	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);

	cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
			nr * entry_size, GFP_NOIO);
	if (!cpu_r)
		return NULL;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	if (nr) {
		struct bch_replicas_cpu_entry *dst =
			cpu_replicas_entry(cpu_r, 0);
		struct bch_replicas_entry *src = sb_r->entries;

		while (dst < cpu_replicas_entry(cpu_r, nr)) {
			dst->data_type = src->data_type;
			for (i = 0; i < src->nr; i++)
				replicas_set_dev(dst, src->devs[i]);

			src	= replicas_entry_next(src);
			dst	= (void *) dst + entry_size;
		}
	}

	bch2_cpu_replicas_sort(cpu_r);
	return cpu_r;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_cpu *cpu_r, *old_r;

	sb_r	= bch2_sb_get_replicas(c->disk_sb.sb);
	cpu_r	= __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		return -ENOMEM;

	old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas, cpu_r);
	if (old_r)
		kfree_rcu(old_r, rcu);

	return 0;
}

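/*
 * The first loop computes the exact encoded size - one byte per set bit in
 * each entry's bitmap, counted with hweight8() - before resizing the
 * superblock field (which is sized in u64 units).
 */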
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *sb_e;
	struct bch_replicas_cpu_entry *e;
	size_t bytes;
	unsigned i;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, e) {
		bytes += sizeof(struct bch_replicas_entry);
		for (i = 0; i < r->entry_size - 1; i++)
			bytes += hweight8(e->devs[i]);
	}

	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
			DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
	if (!sb_r)
		return -ENOSPC;

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	sb_e = sb_r->entries;
	for_each_cpu_replicas_entry(r, e) {
		sb_e->data_type = e->data_type;

		for (i = 0; i < replicas_dev_slots(r); i++)
			if (replicas_test_dev(e, i))
				sb_e->devs[sb_e->nr++] = i;

		sb_e = replicas_entry_next(sb_e);

		BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
	}

	return 0;
}

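/*
 * Superblock validation; errors are reported as human-readable strings.  For
 * duplicate detection the entries are sorted linearly with sort_cmp_size()
 * (not into eytzinger order), so that duplicates end up adjacent.
 */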
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
	struct bch_replicas_cpu *cpu_r = NULL;
	struct bch_replicas_entry *e;
	const char *err;
	unsigned i;

	for_each_replicas_entry(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: no devices";
		if (!e->nr)
			goto err;

		err = "invalid replicas entry: too many devices";
		if (e->nr >= BCH_REPLICAS_MAX)
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		goto err;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i + 1 < cpu_r->nr; i++) {
		struct bch_replicas_cpu_entry *l =
			cpu_replicas_entry(cpu_r, i);
		struct bch_replicas_cpu_entry *r =
			cpu_replicas_entry(cpu_r, i + 1);

		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

		err = "duplicate replicas entry";
		if (!memcmp(l, r, cpu_r->entry_size))
			goto err;
	}

	err = NULL;
err:
	kfree(cpu_r);
	return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_validate_replicas,
};

int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
	char *out = buf, *end = out + size;
	struct bch_replicas_entry *e;
	bool first = true;
	unsigned i;

	if (!r) {
		out += scnprintf(out, end - out, "(no replicas section found)");
		return out - buf;
	}

	for_each_replicas_entry(r, e) {
		if (!first)
			out += scnprintf(out, end - out, " ");
		first = false;

		out += scnprintf(out, end - out, "%u: [", e->data_type);

		for (i = 0; i < e->nr; i++)
			out += scnprintf(out, end - out,
					 i ? " %u" : "%u", e->devs[i]);
		out += scnprintf(out, end - out, "]");
	}

	return out - buf;
}

/* Query replicas: */

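/*
 * Read-side counterparts to bch2_mark_replicas(): lockless checks, under
 * rcu_read_lock() only, for whether a given device list or key is already
 * covered by an existing replicas entry.
 */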
bool bch2_replicas_marked(struct bch_fs *c,
			  enum bch_data_type data_type,
			  struct bch_devs_list devs)
{
	struct bch_replicas_cpu_entry search;
	unsigned max_dev;
	bool ret;

	if (!devs.nr)
		return true;

	devlist_to_replicas(devs, data_type, &search, &max_dev);

	rcu_read_lock();
	ret = replicas_has_entry(rcu_dereference(c->replicas),
				 search, max_dev);
	rcu_read_unlock();

	return ret;
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
			       enum bch_data_type data_type,
			       struct bkey_s_c k)
{
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;

	for (i = 0; i < cached.nr; i++)
		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
					  bch2_dev_list_single(cached.devs[i])))
			return false;

	return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}

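/*
 * For each data type we report the worst case across that type's entries:
 * the *minimum* number of online devices (how much redundancy is actually
 * available) and the *maximum* number of offline devices (how degraded the
 * worst entry is).
 */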
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
					      struct bch_devs_mask online_devs)
{
	struct bch_sb_field_members *mi;
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, dev, dev_slots, nr_online, nr_offline;
	struct replicas_status ret;

	memset(&ret, 0, sizeof(ret));

	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		ret.replicas[i].nr_online = UINT_MAX;

	mi = bch2_sb_get_members(c->disk_sb.sb);
	rcu_read_lock();

	r = rcu_dereference(c->replicas);
	dev_slots = replicas_dev_slots(r);

	for_each_cpu_replicas_entry(r, e) {
		if (e->data_type >= ARRAY_SIZE(ret.replicas))
			panic("e %p data_type %u\n", e, e->data_type);

		nr_online = nr_offline = 0;

		for (dev = 0; dev < dev_slots; dev++) {
			if (!replicas_test_dev(e, dev))
				continue;

			BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));

			if (test_bit(dev, online_devs.d))
				nr_online++;
			else
				nr_offline++;
		}

		ret.replicas[e->data_type].nr_online =
			min(ret.replicas[e->data_type].nr_online,
			    nr_online);

		ret.replicas[e->data_type].nr_offline =
			max(ret.replicas[e->data_type].nr_offline,
			    nr_offline);
	}

	rcu_read_unlock();

	return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
	return __bch2_replicas_status(c, bch2_online_devs(c));
}

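/*
 * A data type counts as degraded if any of its entries has an offline
 * device, and as (potentially) lost if some entry has no online devices at
 * all; the BCH_FORCE_IF_* flags allow proceeding anyway.
 */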
static bool have_enough_devs(struct replicas_status s,
			     enum bch_data_type type,
			     bool force_if_degraded,
			     bool force_if_lost)
{
	return (!s.replicas[type].nr_offline || force_if_degraded) &&
		(s.replicas[type].nr_online || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
	return (have_enough_devs(s, BCH_DATA_JOURNAL,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_BTREE,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_USER,
				 flags & BCH_FORCE_IF_DATA_DEGRADED,
				 flags & BCH_FORCE_IF_DATA_LOST));
}

unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
	struct replicas_status s = bch2_replicas_status(c);

	return meta
		? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
		      s.replicas[BCH_DATA_BTREE].nr_online)
		: s.replicas[BCH_DATA_USER].nr_online;
}

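/*
 * Returns a bitmask of BCH_DATA_* types for which this device appears in at
 * least one replicas entry, i.e. which kinds of data the device may hold.
 */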
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned ret = 0;

	rcu_read_lock();
	r = rcu_dereference(c->replicas);

	if (ca->dev_idx >= replicas_dev_slots(r))
		goto out;

	for_each_cpu_replicas_entry(r, e)
		if (replicas_test_dev(e, ca->dev_idx))
			ret |= 1 << e->data_type;
out:
	rcu_read_unlock();
	return ret;
}