static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

#define for_each_cpu_replicas_entry(_r, _i)				\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
	     _i = (void *) (_i) + (_r)->entry_size)

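/*
 * Usage sketch (illustrative, not from a real caller): entries are
 * variable size, which is why iteration is byte-granular:
 *
 *	struct bch_replicas_cpu_entry *e;
 *
 *	for_each_cpu_replicas_entry(r, e)
 *		pr_debug("data_type %u", e->data_type);
 */
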
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
	return (void *) r->entries + r->entry_size * i;
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
				     unsigned dev)
{
	return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
				    unsigned dev)
{
	e->devs[dev >> 3] |= 1 << (dev & 7);
}

static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
	return (r->entry_size -
		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}

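/*
 * devs[] is a bitmap of device indexes, eight per byte: device 10, for
 * example, is byte 10 >> 3 = 1, bit 10 & 7 = 2, so replicas_set_dev(e, 10)
 * does e->devs[1] |= 1 << 2. replicas_dev_slots() is the number of bits
 * (device slots) the current entry_size leaves room for.
 */
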
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
			      char *buf, size_t size)
{
	char *out = buf, *end = out + size;
	struct bch_replicas_cpu_entry *e;
	bool first = true;
	unsigned i;

	for_each_cpu_replicas_entry(r, e) {
		bool first_e = true;

		if (!first)
			out += scnprintf(out, end - out, " ");
		first = false;

		out += scnprintf(out, end - out, "%u: [", e->data_type);

		for (i = 0; i < replicas_dev_slots(r); i++)
			if (replicas_test_dev(e, i)) {
				if (!first_e)
					out += scnprintf(out, end - out, " ");
				first_e = false;
				out += scnprintf(out, end - out, "%u", i);
			}
		out += scnprintf(out, end - out, "]");
	}

	return out - buf;
}

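/*
 * Output is one "%u: [...]" group per entry, space separated - e.g. a
 * table with entries for data type 3 on devices 0,1 and data type 4 on
 * devices 0,1,2 would (illustratively) render as "3: [0 1] 4: [0 1 2]".
 */
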
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
					enum bch_data_type data_type,
					struct bch_replicas_cpu_entry *r,
					unsigned *max_dev)
{
	const struct bch_extent_ptr *ptr;
	unsigned nr = 0;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	memset(r, 0, sizeof(*r));
	r->data_type = data_type;

	*max_dev = 0;

	extent_for_each_ptr(e, ptr)
		if (!ptr->cached) {
			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
			replicas_set_dev(r, ptr->dev);
			nr++;
		}

	return nr;
}

static inline void devlist_to_replicas(struct bch_devs_list devs,
				       enum bch_data_type data_type,
				       struct bch_replicas_cpu_entry *r,
				       unsigned *max_dev)
{
	unsigned i;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_SB ||
	       data_type >= BCH_DATA_NR);

	memset(r, 0, sizeof(*r));
	r->data_type = data_type;

	*max_dev = 0;

	for (i = 0; i < devs.nr; i++) {
		*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
		replicas_set_dev(r, devs.devs[i]);
	}
}

static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
		       struct bch_replicas_cpu_entry new_entry,
		       unsigned max_dev)
{
	struct bch_replicas_cpu *new;
	unsigned i, nr, entry_size;

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);
	entry_size = max(entry_size, old->entry_size);
	nr = old->nr + 1;

	new = kzalloc(sizeof(struct bch_replicas_cpu) +
		      nr * entry_size, GFP_NOIO);
	if (!new)
		return NULL;

	new->nr		= nr;
	new->entry_size	= entry_size;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(new, i),
		       cpu_replicas_entry(old, i),
		       min(new->entry_size, old->entry_size));

	memcpy(cpu_replicas_entry(new, old->nr),
	       &new_entry,
	       new->entry_size);

	bch2_cpu_replicas_sort(new);
	return new;
}

static bool replicas_has_entry(struct bch_replicas_cpu *r,
			       struct bch_replicas_cpu_entry search,
			       unsigned max_dev)
{
	return max_dev < replicas_dev_slots(r) &&
		eytzinger0_find(r->entries, r->nr,
				r->entry_size,
				memcmp, &search) < r->nr;
}

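/*
 * Lookups require r->entries to be sorted by memcmp() in eytzinger0
 * order, so bch2_cpu_replicas_sort() must be called after any mutation.
 * A search key referencing a device beyond the table's entry_size cannot
 * be present in it; the max_dev check above rejects that case without
 * searching.
 */
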
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				       struct bch_replicas_cpu_entry new_entry,
				       unsigned max_dev)
{
	struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
	int ret = -ENOMEM;

	mutex_lock(&c->sb_lock);

	old_gc = rcu_dereference_protected(c->replicas_gc,
					   lockdep_is_held(&c->sb_lock));
	if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
		new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
		if (!new_gc)
			goto err;
	}

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	if (!replicas_has_entry(old_r, new_entry, max_dev)) {
		new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
		if (!new_r)
			goto err;

		ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
		if (ret)
			goto err;
	}

	/* allocations done, now commit: */
	if (new_r)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	if (new_gc) {
		rcu_assign_pointer(c->replicas_gc, new_gc);
		kfree_rcu(old_gc, rcu);
	}
	if (new_r) {
		rcu_assign_pointer(c->replicas, new_r);
		kfree_rcu(old_r, rcu);
	}

	mutex_unlock(&c->sb_lock);
	return 0;
err:
	mutex_unlock(&c->sb_lock);
	kfree(new_gc);
	kfree(new_r);
	return ret;
}

int bch2_mark_replicas(struct bch_fs *c,
		       enum bch_data_type data_type,
		       struct bch_devs_list devs)
{
	struct bch_replicas_cpu_entry search;
	struct bch_replicas_cpu *r, *gc_r;
	unsigned max_dev;
	bool marked;

	if (!devs.nr)
		return 0;

	BUG_ON(devs.nr >= BCH_REPLICAS_MAX);

	devlist_to_replicas(devs, data_type, &search, &max_dev);

	rcu_read_lock();
	r = rcu_dereference(c->replicas);
	gc_r = rcu_dereference(c->replicas_gc);
	marked = replicas_has_entry(r, search, max_dev) &&
		(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
	rcu_read_unlock();

	return likely(marked) ? 0
		: bch2_mark_replicas_slowpath(c, search, max_dev);
}

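/*
 * Rough calling sketch (hypothetical caller): the replicas entry must be
 * persisted before any data pointing at those devices is written, e.g.:
 *
 *	ret = bch2_mark_replicas(c, BCH_DATA_USER, devs);
 *	if (ret)
 *		return ret;
 *	// entry is now in the superblock; safe to write the data
 */
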
int bch2_mark_bkey_replicas(struct bch_fs *c,
			    enum bch_data_type data_type,
			    struct bkey_s_c k)
{
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;
	int ret;

	for (i = 0; i < cached.nr; i++)
		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
					      bch2_dev_list_single(cached.devs[i]))))
			return ret;

	return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	struct bch_replicas_cpu *new_r, *old_r;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);

	new_r = rcu_dereference_protected(c->replicas_gc,
					  lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas_gc, NULL);

	if (ret)
		goto err;

	if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
		ret = -ENOSPC;
		goto err;
	}

	bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas, new_r);
	kfree_rcu(old_r, rcu);
out:
	mutex_unlock(&c->sb_lock);
	return ret;
err:
	kfree_rcu(new_r, rcu);
	goto out;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_cpu *dst, *src;
	struct bch_replicas_cpu_entry *e;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc);

	src = rcu_dereference_protected(c->replicas,
					lockdep_is_held(&c->sb_lock));

	dst = kzalloc(sizeof(struct bch_replicas_cpu) +
		      src->nr * src->entry_size, GFP_NOIO);
	if (!dst) {
		mutex_unlock(&c->sb_lock);
		return -ENOMEM;
	}

	dst->nr		= 0;
	dst->entry_size	= src->entry_size;

	for_each_cpu_replicas_entry(src, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(dst, dst->nr++),
			       e, dst->entry_size);

	bch2_cpu_replicas_sort(dst);

	rcu_assign_pointer(c->replicas_gc, dst);
	mutex_unlock(&c->sb_lock);

	return 0;
}

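/*
 * Sketch of the gc protocol, assuming a caller that rewalks all live keys
 * (e.g. btree gc):
 *
 *	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
 *	// re-mark every live extent; bch2_mark_replicas() adds entries
 *	// to both c->replicas and c->replicas_gc while gc is in progress
 *	ret = bch2_replicas_gc_end(c, ret);
 *
 * Entries of the gc'd types that were never re-marked are dropped when
 * bch2_replicas_gc_end() swaps c->replicas_gc into c->replicas.
 */
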
/* Replicas tracking - superblock: */

static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
					unsigned *nr,
					unsigned *bytes,
					unsigned *max_dev)
{
	struct bch_replicas_entry *i;
	unsigned j;

	*nr	= 0;
	*bytes	= sizeof(*r);
	*max_dev = 0;

	if (!r)
		return;

	for_each_replicas_entry(r, i) {
		for (j = 0; j < i->nr; j++)
			*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
		(*nr)++;
	}

	*bytes = (void *) i - (void *) r;
}

static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
	struct bch_replicas_cpu *cpu_r;
	unsigned i, nr, bytes, max_dev, entry_size;

	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);

	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
		DIV_ROUND_UP(max_dev + 1, 8);

	cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
			nr * entry_size, GFP_NOIO);
	if (!cpu_r)
		return NULL;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	if (nr) {
		struct bch_replicas_cpu_entry *dst =
			cpu_replicas_entry(cpu_r, 0);
		struct bch_replicas_entry *src = sb_r->entries;

		while (dst < cpu_replicas_entry(cpu_r, nr)) {
			dst->data_type = src->data_type;
			for (i = 0; i < src->nr; i++)
				replicas_set_dev(dst, src->devs[i]);

			src = replicas_entry_next(src);
			dst = (void *) dst + entry_size;
		}
	}

	bch2_cpu_replicas_sort(cpu_r);
	return cpu_r;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_cpu *cpu_r, *old_r;

	sb_r	= bch2_sb_get_replicas(c->disk_sb.sb);
	cpu_r	= __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		return -ENOMEM;

	old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas, cpu_r);
	if (old_r)
		kfree_rcu(old_r, rcu);

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *sb_e;
	struct bch_replicas_cpu_entry *e;
	size_t i, bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, e) {
		bytes += sizeof(struct bch_replicas_entry);
		for (i = 0; i < r->entry_size - 1; i++)
			bytes += hweight8(e->devs[i]);
	}

	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
			DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
	if (!sb_r)
		return -ENOSPC;

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	sb_e = sb_r->entries;
	for_each_cpu_replicas_entry(r, e) {
		sb_e->data_type = e->data_type;

		for (i = 0; i < replicas_dev_slots(r); i++)
			if (replicas_test_dev(e, i))
				sb_e->devs[sb_e->nr++] = i;

		sb_e = replicas_entry_next(sb_e);

		BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
	}

	return 0;
}

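/*
 * The superblock encoding stores one byte per device index rather than a
 * bitmap, so the hweight8() sum above counts device bytes exactly: a
 * hypothetical entry on devices 0, 1 and 2 costs sizeof(struct
 * bch_replicas_entry) plus 3 bytes, before the u64 round-up at resize.
 */
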
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
	struct bch_replicas_cpu *cpu_r = NULL;
	struct bch_replicas_entry *e;
	const char *err;
	unsigned i;

	for_each_replicas_entry(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: no devices";
		if (!e->nr)
			goto err;

		err = "invalid replicas entry: too many devices";
		if (e->nr >= BCH_REPLICAS_MAX)
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		goto err;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i + 1 < cpu_r->nr; i++) {
		struct bch_replicas_cpu_entry *l =
			cpu_replicas_entry(cpu_r, i);
		struct bch_replicas_cpu_entry *r =
			cpu_replicas_entry(cpu_r, i + 1);

		BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);

		err = "duplicate replicas entry";
		if (!memcmp(l, r, cpu_r->entry_size))
			goto err;
	}

	err = NULL;
err:
	kfree(cpu_r);
	return err;
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_validate_replicas,
};

int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
	char *out = buf, *end = out + size;
	struct bch_replicas_entry *e;
	bool first = true;
	unsigned i;

	if (!r) {
		out += scnprintf(out, end - out, "(no replicas section found)");
		return out - buf;
	}

	for_each_replicas_entry(r, e) {
		if (!first)
			out += scnprintf(out, end - out, " ");
		first = false;

		out += scnprintf(out, end - out, "%u: [", e->data_type);

		for (i = 0; i < e->nr; i++)
			out += scnprintf(out, end - out,
					 i ? " %u" : "%u", e->devs[i]);
		out += scnprintf(out, end - out, "]");
	}

	return out - buf;
}

/* Query replicas: */

bool bch2_replicas_marked(struct bch_fs *c,
			  enum bch_data_type data_type,
			  struct bch_devs_list devs)
{
	struct bch_replicas_cpu_entry search;
	unsigned max_dev;
	bool ret;

	if (!devs.nr)
		return true;

	devlist_to_replicas(devs, data_type, &search, &max_dev);

	rcu_read_lock();
	ret = replicas_has_entry(rcu_dereference(c->replicas),
				 search, max_dev);
	rcu_read_unlock();

	return ret;
}

bool bch2_bkey_replicas_marked(struct bch_fs *c,
			       enum bch_data_type data_type,
			       struct bkey_s_c k)
{
	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
	unsigned i;

	for (i = 0; i < cached.nr; i++)
		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
					  bch2_dev_list_single(cached.devs[i])))
			return false;

	return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}

struct replicas_status __bch2_replicas_status(struct bch_fs *c,
					      struct bch_devs_mask online_devs)
{
	struct bch_sb_field_members *mi;
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, dev, dev_slots, nr_online, nr_offline;
	struct replicas_status ret;

	memset(&ret, 0, sizeof(ret));

	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		ret.replicas[i].nr_online = UINT_MAX;

	mi = bch2_sb_get_members(c->disk_sb.sb);
	rcu_read_lock();

	r = rcu_dereference(c->replicas);
	dev_slots = replicas_dev_slots(r);

	for_each_cpu_replicas_entry(r, e) {
		if (e->data_type >= ARRAY_SIZE(ret.replicas))
			panic("e %p data_type %u\n", e, e->data_type);

		nr_online = nr_offline = 0;

		for (dev = 0; dev < dev_slots; dev++) {
			if (!replicas_test_dev(e, dev))
				continue;

			BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));

			if (test_bit(dev, online_devs.d))
				nr_online++;
			else
				nr_offline++;
		}

		ret.replicas[e->data_type].nr_online =
			min(ret.replicas[e->data_type].nr_online,
			    nr_online);
		ret.replicas[e->data_type].nr_offline =
			max(ret.replicas[e->data_type].nr_offline,
			    nr_offline);
	}

	rcu_read_unlock();

	return ret;
}

struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
	return __bch2_replicas_status(c, bch2_online_devs(c));
}

static bool have_enough_devs(struct replicas_status s,
			     enum bch_data_type type,
			     bool force_if_degraded,
			     bool force_if_lost)
{
	return (!s.replicas[type].nr_offline || force_if_degraded) &&
		(s.replicas[type].nr_online || force_if_lost);
}

bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
	return (have_enough_devs(s, BCH_DATA_JOURNAL,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_BTREE,
				 flags & BCH_FORCE_IF_METADATA_DEGRADED,
				 flags & BCH_FORCE_IF_METADATA_LOST) &&
		have_enough_devs(s, BCH_DATA_USER,
				 flags & BCH_FORCE_IF_DATA_DEGRADED,
				 flags & BCH_FORCE_IF_DATA_LOST));
}

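/*
 * Example (hypothetical mount-time check): refuse to start unless every
 * replicas entry still has an online copy, with the BCH_FORCE_IF_* flags
 * letting the user override for degraded or lost data:
 *
 *	if (!bch2_have_enough_devs(bch2_replicas_status(c), flags))
 *		return -EINVAL;
 */
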
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
	struct replicas_status s = bch2_replicas_status(c);

	return meta
		? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
		      s.replicas[BCH_DATA_BTREE].nr_online)
		: s.replicas[BCH_DATA_USER].nr_online;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_replicas_cpu_entry *e;
	struct bch_replicas_cpu *r;
	unsigned ret = 0;

	rcu_read_lock();
	r = rcu_dereference(c->replicas);

	if (ca->dev_idx >= replicas_dev_slots(r))
		goto out;

	for_each_cpu_replicas_entry(r, e)
		if (replicas_test_dev(e, ca->dev_idx))
			ret |= 1 << e->data_type;
out:
	rcu_read_unlock();

	return ret;
}