]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/replicas.c
Update bcachefs sources to b12d1535f3 bcachefs: fix bounds checks in bch2_bio_map()
[bcachefs-tools-debian] / libbcachefs / replicas.c
1
2 #include "bcachefs.h"
3 #include "replicas.h"
4 #include "super-io.h"
5
/*
 * A replicas entry with enough trailing space for the maximum possible
 * number of member devices; used as on-stack scratch when building a
 * search key. Callers zero the whole struct so the pad bytes compare
 * equal against the kzalloc'd table entries.
 */
struct bch_replicas_entry_padded {
	struct bch_replicas_entry	e;
	u8				pad[BCH_SB_MEMBERS_MAX];
};
10
/* Forward declaration: serializes an in-memory replicas table into the superblock. */
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);
13
14 /* Replicas tracking - in memory: */
15
16 static inline int u8_cmp(u8 l, u8 r)
17 {
18         return (l > r) - (l < r);
19 }
20
/*
 * Sort a single entry's device list so entries can be compared
 * bytewise (memcmp) regardless of the order devices were added.
 */
static void replicas_entry_sort(struct bch_replicas_entry *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
25
/*
 * Iterate over every entry of an in-memory replicas table: entries are
 * laid out contiguously, each occupying exactly (_r)->entry_size bytes.
 */
#define for_each_cpu_replicas_entry(_r, _i)				\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
	     _i = (void *) (_i) + (_r)->entry_size)
30
/* Return a pointer to the i'th fixed-size slot in the table. */
static inline struct bch_replicas_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
	return (void *) r->entries + r->entry_size * i;
}
36
/* Sort the table into eytzinger order so it can be binary searched. */
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
41
42 static int replicas_entry_to_text(struct bch_replicas_entry *e,
43                                   char *buf, size_t size)
44 {
45         char *out = buf, *end = out + size;
46         unsigned i;
47
48         out += scnprintf(out, end - out, "%u: [", e->data_type);
49
50         for (i = 0; i < e->nr_devs; i++)
51                 out += scnprintf(out, end - out,
52                                  i ? " %u" : "%u", e->devs[i]);
53         out += scnprintf(out, end - out, "]");
54
55         return out - buf;
56 }
57
58 int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
59                               char *buf, size_t size)
60 {
61         char *out = buf, *end = out + size;
62         struct bch_replicas_entry *e;
63         bool first = true;
64
65         for_each_cpu_replicas_entry(r, e) {
66                 if (!first)
67                         out += scnprintf(out, end - out, " ");
68                 first = false;
69
70                 out += replicas_entry_to_text(e, out, end - out);
71         }
72
73         return out - buf;
74 }
75
/*
 * Append the device index of every dirty (non-cached) pointer in an
 * extent to the replicas entry being built; cached copies don't count
 * toward durability so they are skipped. Non-data keys are a no-op.
 */
static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry *r)
{
	if (bkey_extent_is_data(k.k)) {
		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;

		extent_for_each_ptr_decode(e, p, entry)
			if (!p.ptr.cached)
				r->devs[r->nr_devs++] = p.ptr.dev;
	}
}
89
90 static void bkey_to_replicas(enum bkey_type type,
91                              struct bkey_s_c k,
92                              struct bch_replicas_entry *e)
93 {
94         e->nr_devs = 0;
95
96         switch (type) {
97         case BKEY_TYPE_BTREE:
98                 e->data_type = BCH_DATA_BTREE;
99                 extent_to_replicas(k, e);
100                 break;
101         case BKEY_TYPE_EXTENTS:
102                 e->data_type = BCH_DATA_USER;
103                 extent_to_replicas(k, e);
104                 break;
105         default:
106                 break;
107         }
108
109         replicas_entry_sort(e);
110 }
111
112 static inline void devlist_to_replicas(struct bch_devs_list devs,
113                                        enum bch_data_type data_type,
114                                        struct bch_replicas_entry *e)
115 {
116         unsigned i;
117
118         BUG_ON(!data_type ||
119                data_type == BCH_DATA_SB ||
120                data_type >= BCH_DATA_NR);
121
122         e->data_type    = data_type;
123         e->nr_devs      = 0;
124
125         for (i = 0; i < devs.nr; i++)
126                 e->devs[e->nr_devs++] = devs.devs[i];
127
128         replicas_entry_sort(e);
129 }
130
/*
 * Return a newly allocated table containing all of @old's entries plus
 * @new_entry, sorted. entry_size is widened if the new entry is larger
 * than any existing one, so old entries must be copied slot-by-slot
 * into the (zero-padded) wider slots. Returns NULL on allocation
 * failure; @old is never modified or freed.
 */
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
		       struct bch_replicas_entry *new_entry)
{
	struct bch_replicas_cpu *new;
	unsigned i, nr, entry_size;

	entry_size = max_t(unsigned, old->entry_size,
			   replicas_entry_bytes(new_entry));
	nr = old->nr + 1;

	/* kzalloc: pad bytes in each slot must be zero for memcmp lookups */
	new = kzalloc(sizeof(struct bch_replicas_cpu) +
		      nr * entry_size, GFP_NOIO);
	if (!new)
		return NULL;

	new->nr		= nr;
	new->entry_size = entry_size;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(new);
	return new;
}
162
/*
 * Exact-match binary search (eytzinger order) for @search. An entry
 * wider than the table's entry_size cannot be present, so bail early.
 * The memcmp compares a full entry_size bytes, which may extend past
 * @search's own bytes — callers must supply a zero-padded search key.
 */
static bool replicas_has_entry(struct bch_replicas_cpu *r,
			       struct bch_replicas_entry *search)
{
	return replicas_entry_bytes(search) <= r->entry_size &&
		eytzinger0_find(r->entries, r->nr,
				r->entry_size,
				memcmp, search) < r->nr;
}
171
/*
 * Slow path for marking a replicas entry: the entry wasn't found in the
 * in-memory table(s). Under sb_lock, build enlarged copies of the live
 * table (and the GC table if a GC pass is in flight), persist the new
 * live table to the superblock, and only then publish the new in-memory
 * copies via RCU. Returns 0, -ENOMEM, or an error from superblock
 * serialization.
 */
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				struct bch_replicas_entry *new_entry)
{
	struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
	int ret = -ENOMEM;

	mutex_lock(&c->sb_lock);

	/* If a replicas GC is running, its table needs the entry too */
	old_gc = rcu_dereference_protected(c->replicas_gc,
					   lockdep_is_held(&c->sb_lock));
	if (old_gc && !replicas_has_entry(old_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(old_gc, new_entry);
		if (!new_gc)
			goto err;
	}

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));
	if (!replicas_has_entry(old_r, new_entry)) {
		new_r = cpu_replicas_add_entry(old_r, new_entry);
		if (!new_r)
			goto err;

		ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
		if (ret)
			goto err;
	}

	/* allocations done, now commit: */

	if (new_r)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */

	if (new_gc) {
		rcu_assign_pointer(c->replicas_gc, new_gc);
		kfree_rcu(old_gc, rcu);
	}

	if (new_r) {
		rcu_assign_pointer(c->replicas, new_r);
		kfree_rcu(old_r, rcu);
	}

	mutex_unlock(&c->sb_lock);
	return 0;
err:
	/* neither new table was published, so plain kfree is safe */
	mutex_unlock(&c->sb_lock);
	kfree(new_gc);
	kfree(new_r);
	return ret;
}
226
227 static int __bch2_mark_replicas(struct bch_fs *c,
228                                 struct bch_replicas_entry *devs)
229 {
230         struct bch_replicas_cpu *r, *gc_r;
231         bool marked;
232
233         rcu_read_lock();
234         r = rcu_dereference(c->replicas);
235         gc_r = rcu_dereference(c->replicas_gc);
236         marked = replicas_has_entry(r, devs) &&
237                 (!likely(gc_r) || replicas_has_entry(gc_r, devs));
238         rcu_read_unlock();
239
240         return likely(marked) ? 0
241                 : bch2_mark_replicas_slowpath(c, devs);
242 }
243
244 int bch2_mark_replicas(struct bch_fs *c,
245                        enum bch_data_type data_type,
246                        struct bch_devs_list devs)
247 {
248         struct bch_replicas_entry_padded search;
249
250         if (!devs.nr)
251                 return 0;
252
253         memset(&search, 0, sizeof(search));
254
255         BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
256
257         devlist_to_replicas(devs, data_type, &search.e);
258
259         return __bch2_mark_replicas(c, &search.e);
260 }
261
262 int bch2_mark_bkey_replicas(struct bch_fs *c,
263                             enum bkey_type type,
264                             struct bkey_s_c k)
265 {
266         struct bch_replicas_entry_padded search;
267         int ret;
268
269         if (type == BKEY_TYPE_EXTENTS) {
270                 struct bch_devs_list cached = bch2_bkey_cached_devs(k);
271                 unsigned i;
272
273                 for (i = 0; i < cached.nr; i++)
274                         if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
275                                                 bch2_dev_list_single(cached.devs[i]))))
276                                 return ret;
277         }
278
279         bkey_to_replicas(type, k, &search.e);
280
281         return search.e.nr_devs
282                 ? __bch2_mark_replicas(c, &search.e)
283                 : 0;
284 }
285
/*
 * Finish a replicas GC pass: persist the rebuilt (gc) table to the
 * superblock, then swap it in as the live table via RCU.
 *
 * @ret: status of the GC pass itself; if nonzero the rebuilt table is
 * discarded and @ret is returned unchanged.
 *
 * Caller must hold replicas_gc_lock.
 */
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	struct bch_replicas_cpu *new_r, *old_r;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);

	/* take ownership of the gc table; clear the published pointer */
	new_r = rcu_dereference_protected(c->replicas_gc,
					  lockdep_is_held(&c->sb_lock));
	rcu_assign_pointer(c->replicas_gc, NULL);

	if (ret)
		goto err;

	if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
		ret = -ENOSPC;
		goto err;
	}

	bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */

	old_r = rcu_dereference_protected(c->replicas,
					  lockdep_is_held(&c->sb_lock));

	rcu_assign_pointer(c->replicas, new_r);
	kfree_rcu(old_r, rcu);
out:
	mutex_unlock(&c->sb_lock);
	return ret;
err:
	/* kfree_rcu: readers may still hold a reference from before the swap */
	kfree_rcu(new_r, rcu);
	goto out;
}
322
/*
 * Begin a replicas GC pass: build a copy of the live replicas table
 * containing only entries whose data type is NOT in @typemask, and
 * publish it as c->replicas_gc. Entries for the masked types are
 * re-added as they are rediscovered during the mark pass.
 *
 * Caller must hold replicas_gc_lock. Returns 0 or -ENOMEM.
 */
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_cpu *dst, *src;
	struct bch_replicas_entry *e;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc);

	src = rcu_dereference_protected(c->replicas,
					lockdep_is_held(&c->sb_lock));

	/* worst case: every entry is kept */
	dst = kzalloc(sizeof(struct bch_replicas_cpu) +
		      src->nr * src->entry_size, GFP_NOIO);
	if (!dst) {
		mutex_unlock(&c->sb_lock);
		return -ENOMEM;
	}

	dst->nr		= 0;
	dst->entry_size = src->entry_size;

	for_each_cpu_replicas_entry(src, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(dst, dst->nr++),
			       e, dst->entry_size);

	bch2_cpu_replicas_sort(dst);

	rcu_assign_pointer(c->replicas_gc, dst);
	mutex_unlock(&c->sb_lock);

	return 0;
}
358
359 /* Replicas tracking - superblock: */
360
/*
 * Build an in-memory replicas table from the superblock replicas
 * section. Two passes: first count entries and find the widest one
 * (all slots are padded to that entry_size), then copy each entry and
 * sort its device list. A NULL/missing section yields an empty table.
 * Returns NULL on allocation failure.
 */
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
	struct bch_replicas_entry *e, *dst;
	struct bch_replicas_cpu *cpu_r;
	unsigned nr = 0, entry_size = 0;

	if (sb_r)
		for_each_replicas_entry(sb_r, e) {
			entry_size = max_t(unsigned, entry_size,
					   replicas_entry_bytes(e));
			nr++;
		}

	/* kzalloc: slot padding must be zero for memcmp-based lookups */
	cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
			nr * entry_size, GFP_NOIO);
	if (!cpu_r)
		return NULL;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	nr = 0;

	if (sb_r)
		for_each_replicas_entry(sb_r, e) {
			dst = cpu_replicas_entry(cpu_r, nr++);
			memcpy(dst, e, replicas_entry_bytes(e));
			replicas_entry_sort(dst);
		}

	bch2_cpu_replicas_sort(cpu_r);
	return cpu_r;
}
395
396 int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
397 {
398         struct bch_sb_field_replicas *sb_r;
399         struct bch_replicas_cpu *cpu_r, *old_r;
400
401         sb_r    = bch2_sb_get_replicas(c->disk_sb.sb);
402         cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
403         if (!cpu_r)
404                 return -ENOMEM;
405
406         old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
407         rcu_assign_pointer(c->replicas, cpu_r);
408         if (old_r)
409                 kfree_rcu(old_r, rcu);
410
411         return 0;
412 }
413
/*
 * Serialize an in-memory replicas table into the superblock's replicas
 * section, resizing the section as needed. On disk, entries are packed
 * at their natural size (replicas_entry_bytes), not the table's fixed
 * entry_size. Returns 0, or -ENOSPC if the superblock can't grow.
 */
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *dst, *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src);

	/* superblock fields are sized in u64s, rounded up */
	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -ENOSPC;

	/* zero the section so no stale bytes remain past the entries */
	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}
446
447 static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
448 {
449         unsigned i;
450
451         sort_cmp_size(cpu_r->entries,
452                       cpu_r->nr,
453                       cpu_r->entry_size,
454                       memcmp, NULL);
455
456         for (i = 0; i + 1 < cpu_r->nr; i++) {
457                 struct bch_replicas_entry *l =
458                         cpu_replicas_entry(cpu_r, i);
459                 struct bch_replicas_entry *r =
460                         cpu_replicas_entry(cpu_r, i + 1);
461
462                 BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
463
464                 if (!memcmp(l, r, cpu_r->entry_size))
465                         return "duplicate replicas entry";
466         }
467
468         return NULL;
469 }
470
/*
 * Superblock validation hook for the replicas section: check each
 * entry's data type, device count and device indices against the
 * members section, then check for duplicate entries. Returns NULL if
 * valid, otherwise a static error string.
 */
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
	struct bch_replicas_cpu *cpu_r = NULL;
	struct bch_replicas_entry *e;
	const char *err;
	unsigned i;

	for_each_replicas_entry(sb_r, e) {
		err = "invalid replicas entry: invalid data type";
		if (e->data_type >= BCH_DATA_NR)
			goto err;

		err = "invalid replicas entry: no devices";
		if (!e->nr_devs)
			goto err;

		err = "invalid replicas entry: too many devices";
		if (e->nr_devs >= BCH_REPLICAS_MAX)
			goto err;

		err = "invalid replicas entry: invalid device";
		for (i = 0; i < e->nr_devs; i++)
			if (!bch2_dev_exists(sb, mi, e->devs[i]))
				goto err;
	}

	err = "cannot allocate memory";
	cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
	if (!cpu_r)
		goto err;

	/* NULL on success; falls through to the shared cleanup either way */
	err = check_dup_replicas_entries(cpu_r);
err:
	kfree(cpu_r);
	return err;
}
509
/* Superblock field ops: validation hook for the replicas section. */
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_validate_replicas,
};
513
514 int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
515 {
516         char *out = buf, *end = out + size;
517         struct bch_replicas_entry *e;
518         bool first = true;
519
520         if (!r) {
521                 out += scnprintf(out, end - out, "(no replicas section found)");
522                 return out - buf;
523         }
524
525         for_each_replicas_entry(r, e) {
526                 if (!first)
527                         out += scnprintf(out, end - out, " ");
528                 first = false;
529
530                 out += replicas_entry_to_text(e, out, end - out);
531         }
532
533         return out - buf;
534 }
535
536 /* Query replicas: */
537
538 bool bch2_replicas_marked(struct bch_fs *c,
539                           enum bch_data_type data_type,
540                           struct bch_devs_list devs)
541 {
542         struct bch_replicas_entry_padded search;
543         bool ret;
544
545         if (!devs.nr)
546                 return true;
547
548         memset(&search, 0, sizeof(search));
549
550         devlist_to_replicas(devs, data_type, &search.e);
551
552         rcu_read_lock();
553         ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e);
554         rcu_read_unlock();
555
556         return ret;
557 }
558
559 bool bch2_bkey_replicas_marked(struct bch_fs *c,
560                                enum bkey_type type,
561                                struct bkey_s_c k)
562 {
563         struct bch_replicas_entry_padded search;
564         bool ret;
565
566         if (type == BKEY_TYPE_EXTENTS) {
567                 struct bch_devs_list cached = bch2_bkey_cached_devs(k);
568                 unsigned i;
569
570                 for (i = 0; i < cached.nr; i++)
571                         if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
572                                         bch2_dev_list_single(cached.devs[i])))
573                                 return false;
574         }
575
576         bkey_to_replicas(type, k, &search.e);
577
578         if (!search.e.nr_devs)
579                 return true;
580
581         rcu_read_lock();
582         ret = replicas_has_entry(rcu_dereference(c->replicas), &search.e);
583         rcu_read_unlock();
584
585         return ret;
586 }
587
/*
 * Compute per-data-type replica availability against @online_devs:
 * for each data type, nr_online is the minimum number of online
 * replicas across all entries of that type (UINT_MAX when no entries
 * exist for the type), and nr_offline is the maximum number of offline
 * replicas across them.
 */
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
					      struct bch_devs_mask online_devs)
{
	struct bch_sb_field_members *mi;
	struct bch_replicas_entry *e;
	struct bch_replicas_cpu *r;
	unsigned i, nr_online, nr_offline;
	struct replicas_status ret;

	memset(&ret, 0, sizeof(ret));

	/* UINT_MAX == "no entries seen yet"; min() below narrows it */
	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
		ret.replicas[i].nr_online = UINT_MAX;

	mi = bch2_sb_get_members(c->disk_sb.sb);
	rcu_read_lock();
	r = rcu_dereference(c->replicas);

	for_each_cpu_replicas_entry(r, e) {
		/* table corruption: data_type was validated at load time */
		if (e->data_type >= ARRAY_SIZE(ret.replicas))
			panic("e %p data_type %u\n", e, e->data_type);

		nr_online = nr_offline = 0;

		for (i = 0; i < e->nr_devs; i++) {
			BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi,
						e->devs[i]));

			if (test_bit(e->devs[i], online_devs.d))
				nr_online++;
			else
				nr_offline++;
		}

		ret.replicas[e->data_type].nr_online =
			min(ret.replicas[e->data_type].nr_online,
			    nr_online);

		ret.replicas[e->data_type].nr_offline =
			max(ret.replicas[e->data_type].nr_offline,
			    nr_offline);
	}

	rcu_read_unlock();

	return ret;
}
635
/* Replica availability against the set of currently online devices. */
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
	return __bch2_replicas_status(c, bch2_online_devs(c));
}
640
641 static bool have_enough_devs(struct replicas_status s,
642                              enum bch_data_type type,
643                              bool force_if_degraded,
644                              bool force_if_lost)
645 {
646         return (!s.replicas[type].nr_offline || force_if_degraded) &&
647                 (s.replicas[type].nr_online || force_if_lost);
648 }
649
650 bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
651 {
652         return (have_enough_devs(s, BCH_DATA_JOURNAL,
653                                  flags & BCH_FORCE_IF_METADATA_DEGRADED,
654                                  flags & BCH_FORCE_IF_METADATA_LOST) &&
655                 have_enough_devs(s, BCH_DATA_BTREE,
656                                  flags & BCH_FORCE_IF_METADATA_DEGRADED,
657                                  flags & BCH_FORCE_IF_METADATA_LOST) &&
658                 have_enough_devs(s, BCH_DATA_USER,
659                                  flags & BCH_FORCE_IF_DATA_DEGRADED,
660                                  flags & BCH_FORCE_IF_DATA_LOST));
661 }
662
663 unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
664 {
665         struct replicas_status s = bch2_replicas_status(c);
666
667         return meta
668                 ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
669                       s.replicas[BCH_DATA_BTREE].nr_online)
670                 : s.replicas[BCH_DATA_USER].nr_online;
671 }
672
673 unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
674 {
675         struct bch_replicas_entry *e;
676         struct bch_replicas_cpu *r;
677         unsigned i, ret = 0;
678
679         rcu_read_lock();
680         r = rcu_dereference(c->replicas);
681
682         for_each_cpu_replicas_entry(r, e)
683                 for (i = 0; i < e->nr_devs; i++)
684                         if (e->devs[i] == ca->dev_idx)
685                                 ret |= 1 << e->data_type;
686
687         rcu_read_unlock();
688
689         return ret;
690 }