1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code for manipulating bucket marks for garbage collection.
4  *
5  * Copyright 2014 Datera, Inc.
6  */
7
8 #include "bcachefs.h"
9 #include "alloc_background.h"
10 #include "backpointers.h"
11 #include "bset.h"
12 #include "btree_gc.h"
13 #include "btree_update.h"
14 #include "buckets.h"
15 #include "buckets_waiting_for_journal.h"
16 #include "ec.h"
17 #include "error.h"
18 #include "inode.h"
19 #include "movinggc.h"
20 #include "recovery.h"
21 #include "reflink.h"
22 #include "replicas.h"
23 #include "subvolume.h"
24 #include "trace.h"
25
26 #include <linux/preempt.h>
27
28 static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
29                                               enum bch_data_type data_type,
30                                               s64 sectors)
31 {
32         switch (data_type) {
33         case BCH_DATA_btree:
34                 fs_usage->btree         += sectors;
35                 break;
36         case BCH_DATA_user:
37         case BCH_DATA_parity:
38                 fs_usage->data          += sectors;
39                 break;
40         case BCH_DATA_cached:
41                 fs_usage->cached        += sectors;
42                 break;
43         default:
44                 break;
45         }
46 }
47
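/*
 * Recompute the summary counters in c->usage_base: fold in the percpu
 * accumulators, total the persistent reservations, rebuild the per-data-type
 * totals from the replicas table, and count each device's superblock and
 * journal buckets as hidden.  Takes mark_lock for write.
 */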
48 void bch2_fs_usage_initialize(struct bch_fs *c)
49 {
50         percpu_down_write(&c->mark_lock);
51         struct bch_fs_usage *usage = c->usage_base;
52
53         for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
54                 bch2_fs_usage_acc_to_base(c, i);
55
56         for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
57                 usage->reserved += usage->persistent_reserved[i];
58
59         for (unsigned i = 0; i < c->replicas.nr; i++) {
60                 struct bch_replicas_entry_v1 *e =
61                         cpu_replicas_entry(&c->replicas, i);
62
63                 fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
64         }
65
66         for_each_member_device(c, ca) {
67                 struct bch_dev_usage dev = bch2_dev_usage_read(ca);
68
69                 usage->hidden += (dev.d[BCH_DATA_sb].buckets +
70                                   dev.d[BCH_DATA_journal].buckets) *
71                         ca->mi.bucket_size;
72         }
73
74         percpu_up_write(&c->mark_lock);
75 }
76
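/*
 * Return the percpu device-usage counters to update: the gc copy when called
 * from gc, otherwise the accumulator indexed by journal_seq (one per journal
 * buffer).
 */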
77 static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
78                                                   unsigned journal_seq,
79                                                   bool gc)
80 {
81         BUG_ON(!gc && !journal_seq);
82
83         return this_cpu_ptr(gc
84                             ? ca->usage_gc
85                             : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
86 }
87
88 void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
89 {
90         struct bch_fs *c = ca->fs;
91         unsigned seq, i, u64s = dev_usage_u64s();
92
93         do {
94                 seq = read_seqcount_begin(&c->usage_lock);
95                 memcpy(usage, ca->usage_base, u64s * sizeof(u64));
96                 for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
97                         acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s);
98         } while (read_seqcount_retry(&c->usage_lock, seq));
99 }
100
101 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
102 {
103         ssize_t offset = v - (u64 *) c->usage_base;
104         unsigned i, seq;
105         u64 ret;
106
107         BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
108         percpu_rwsem_assert_held(&c->mark_lock);
109
110         do {
111                 seq = read_seqcount_begin(&c->usage_lock);
112                 ret = *v;
113
114                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
115                         ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
116         } while (read_seqcount_retry(&c->usage_lock, seq));
117
118         return ret;
119 }
120
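/*
 * Allocate and return a snapshot of filesystem usage: the base counters plus
 * every percpu accumulator, read consistently under the usage_lock seqcount.
 * If the replicas table grew after the allocation was sized, retry with the
 * new size.  The caller is responsible for freeing the result.
 */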
121 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
122 {
123         struct bch_fs_usage_online *ret;
124         unsigned nr_replicas = READ_ONCE(c->replicas.nr);
125         unsigned seq, i;
126 retry:
127         ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL);
128         if (unlikely(!ret))
129                 return NULL;
130
131         percpu_down_read(&c->mark_lock);
132
133         if (nr_replicas != c->replicas.nr) {
134                 nr_replicas = c->replicas.nr;
135                 percpu_up_read(&c->mark_lock);
136                 kfree(ret);
137                 goto retry;
138         }
139
140         ret->online_reserved = percpu_u64_get(c->online_reserved);
141
142         do {
143                 seq = read_seqcount_begin(&c->usage_lock);
144                 unsafe_memcpy(&ret->u, c->usage_base,
145                               __fs_usage_u64s(nr_replicas) * sizeof(u64),
146                               "embedded variable length struct");
147                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
148                         acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
149                                         __fs_usage_u64s(nr_replicas));
150         } while (read_seqcount_retry(&c->usage_lock, seq));
151
152         return ret;
153 }
154
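/*
 * Fold percpu accumulator @idx into the base usage counters, for the
 * filesystem and for every member device, and zero it; done under the
 * usage_lock write seqcount so concurrent readers retry rather than see a
 * partial transfer.
 */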
155 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
156 {
157         unsigned u64s = fs_usage_u64s(c);
158
159         BUG_ON(idx >= ARRAY_SIZE(c->usage));
160
161         preempt_disable();
162         write_seqcount_begin(&c->usage_lock);
163
164         acc_u64s_percpu((u64 *) c->usage_base,
165                         (u64 __percpu *) c->usage[idx], u64s);
166         percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
167
168         rcu_read_lock();
169         for_each_member_device_rcu(c, ca, NULL) {
170                 u64s = dev_usage_u64s();
171
172                 acc_u64s_percpu((u64 *) ca->usage_base,
173                                 (u64 __percpu *) ca->usage[idx], u64s);
174                 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
175         }
176         rcu_read_unlock();
177
178         write_seqcount_end(&c->usage_lock);
179         preempt_enable();
180 }
181
182 void bch2_fs_usage_to_text(struct printbuf *out,
183                            struct bch_fs *c,
184                            struct bch_fs_usage_online *fs_usage)
185 {
186         unsigned i;
187
188         prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
189
190         prt_printf(out, "hidden:\t\t\t\t%llu\n",
191                fs_usage->u.hidden);
192         prt_printf(out, "data:\t\t\t\t%llu\n",
193                fs_usage->u.data);
194         prt_printf(out, "cached:\t\t\t\t%llu\n",
195                fs_usage->u.cached);
196         prt_printf(out, "reserved:\t\t\t%llu\n",
197                fs_usage->u.reserved);
198         prt_printf(out, "nr_inodes:\t\t\t%llu\n",
199                fs_usage->u.nr_inodes);
200         prt_printf(out, "online reserved:\t\t%llu\n",
201                fs_usage->online_reserved);
202
203         for (i = 0;
204              i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
205              i++) {
206                 prt_printf(out, "%u replicas:\n", i + 1);
207                 prt_printf(out, "\treserved:\t\t%llu\n",
208                        fs_usage->u.persistent_reserved[i]);
209         }
210
211         for (i = 0; i < c->replicas.nr; i++) {
212                 struct bch_replicas_entry_v1 *e =
213                         cpu_replicas_entry(&c->replicas, i);
214
215                 prt_printf(out, "\t");
216                 bch2_replicas_entry_to_text(out, e);
217                 prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
218         }
219 }
220
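/*
 * Inflate a reservation by roughly 1 / 2^RESERVE_FACTOR, with the fraction
 * rounded up: returns r + ceil(r / 2^RESERVE_FACTOR).
 */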
221 static u64 reserve_factor(u64 r)
222 {
223         return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
224 }
225
226 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
227 {
228         return min(fs_usage->u.hidden +
229                    fs_usage->u.btree +
230                    fs_usage->u.data +
231                    reserve_factor(fs_usage->u.reserved +
232                                   fs_usage->online_reserved),
233                    c->capacity);
234 }
235
236 static struct bch_fs_usage_short
237 __bch2_fs_usage_read_short(struct bch_fs *c)
238 {
239         struct bch_fs_usage_short ret;
240         u64 data, reserved;
241
242         ret.capacity = c->capacity -
243                 bch2_fs_usage_read_one(c, &c->usage_base->hidden);
244
245         data            = bch2_fs_usage_read_one(c, &c->usage_base->data) +
246                 bch2_fs_usage_read_one(c, &c->usage_base->btree);
247         reserved        = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
248                 percpu_u64_get(c->online_reserved);
249
250         ret.used        = min(ret.capacity, data + reserve_factor(reserved));
251         ret.free        = ret.capacity - ret.used;
252
253         ret.nr_inodes   = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
254
255         return ret;
256 }
257
258 struct bch_fs_usage_short
259 bch2_fs_usage_read_short(struct bch_fs *c)
260 {
261         struct bch_fs_usage_short ret;
262
263         percpu_down_read(&c->mark_lock);
264         ret = __bch2_fs_usage_read_short(c);
265         percpu_up_read(&c->mark_lock);
266
267         return ret;
268 }
269
270 void bch2_dev_usage_init(struct bch_dev *ca)
271 {
272         ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
273 }
274
275 void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
276 {
277         prt_tab(out);
278         prt_str(out, "buckets");
279         prt_tab_rjust(out);
280         prt_str(out, "sectors");
281         prt_tab_rjust(out);
282         prt_str(out, "fragmented");
283         prt_tab_rjust(out);
284         prt_newline(out);
285
286         for (unsigned i = 0; i < BCH_DATA_NR; i++) {
287                 prt_str(out, bch2_data_types[i]);
288                 prt_tab(out);
289                 prt_u64(out, usage->d[i].buckets);
290                 prt_tab_rjust(out);
291                 prt_u64(out, usage->d[i].sectors);
292                 prt_tab_rjust(out);
293                 prt_u64(out, usage->d[i].fragmented);
294                 prt_tab_rjust(out);
295                 prt_newline(out);
296         }
297 }
298
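/*
 * Apply the delta between an old and new alloc key to in-memory accounting:
 * the filesystem's hidden-sector count and the device's per-data-type bucket,
 * sector, cached-sector and fragmentation counters.
 */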
299 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
300                                   struct bch_alloc_v4 old,
301                                   struct bch_alloc_v4 new,
302                                   u64 journal_seq, bool gc)
303 {
304         struct bch_fs_usage *fs_usage;
305         struct bch_dev_usage *u;
306
307         preempt_disable();
308         fs_usage = fs_usage_ptr(c, journal_seq, gc);
309
310         if (data_type_is_hidden(old.data_type))
311                 fs_usage->hidden -= ca->mi.bucket_size;
312         if (data_type_is_hidden(new.data_type))
313                 fs_usage->hidden += ca->mi.bucket_size;
314
315         u = dev_usage_ptr(ca, journal_seq, gc);
316
317         u->d[old.data_type].buckets--;
318         u->d[new.data_type].buckets++;
319
320         u->d[old.data_type].sectors -= bch2_bucket_sectors_dirty(old);
321         u->d[new.data_type].sectors += bch2_bucket_sectors_dirty(new);
322
323         u->d[BCH_DATA_cached].sectors += new.cached_sectors;
324         u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
325
326         u->d[old.data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, old);
327         u->d[new.data_type].fragmented += bch2_bucket_sectors_fragmented(ca, new);
328
329         preempt_enable();
330 }
331
332 static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
333 {
334         return (struct bch_alloc_v4) {
335                 .gen            = b.gen,
336                 .data_type      = b.data_type,
337                 .dirty_sectors  = b.dirty_sectors,
338                 .cached_sectors = b.cached_sectors,
339                 .stripe         = b.stripe,
340         };
341 }
342
343 static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
344                                     struct bucket old, struct bucket new)
345 {
346         bch2_dev_usage_update(c, ca,
347                               bucket_m_to_alloc(old),
348                               bucket_m_to_alloc(new),
349                               0, true);
350 }
351
352 static inline int __update_replicas(struct bch_fs *c,
353                                     struct bch_fs_usage *fs_usage,
354                                     struct bch_replicas_entry_v1 *r,
355                                     s64 sectors)
356 {
357         int idx = bch2_replicas_entry_idx(c, r);
358
359         if (idx < 0)
360                 return -1;
361
362         fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
363         fs_usage->replicas[idx]         += sectors;
364         return 0;
365 }
366
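/*
 * Account @sectors against the usage slot for replicas entry @r.  If the
 * entry is missing, report it via fsck_err and try to add it with
 * bch2_mark_replicas() before giving up.
 */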
367 static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
368                         struct bch_replicas_entry_v1 *r, s64 sectors,
369                         unsigned journal_seq, bool gc)
370 {
371         struct bch_fs_usage *fs_usage;
372         int idx, ret = 0;
373         struct printbuf buf = PRINTBUF;
374
375         percpu_down_read(&c->mark_lock);
376
377         idx = bch2_replicas_entry_idx(c, r);
378         if (idx < 0 &&
379             fsck_err(c, ptr_to_missing_replicas_entry,
380                      "no replicas entry\n  while marking %s",
381                      (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
382                 percpu_up_read(&c->mark_lock);
383                 ret = bch2_mark_replicas(c, r);
384                 percpu_down_read(&c->mark_lock);
385
386                 if (ret)
387                         goto err;
388                 idx = bch2_replicas_entry_idx(c, r);
389         }
390         if (idx < 0) {
391                 ret = -1;
392                 goto err;
393         }
394
395         preempt_disable();
396         fs_usage = fs_usage_ptr(c, journal_seq, gc);
397         fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
398         fs_usage->replicas[idx]         += sectors;
399         preempt_enable();
400 err:
401 fsck_err:
402         percpu_up_read(&c->mark_lock);
403         printbuf_exit(&buf);
404         return ret;
405 }
406
407 static inline int update_cached_sectors(struct bch_fs *c,
408                         struct bkey_s_c k,
409                         unsigned dev, s64 sectors,
410                         unsigned journal_seq, bool gc)
411 {
412         struct bch_replicas_padded r;
413
414         bch2_replicas_entry_cached(&r.e, dev);
415
416         return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
417 }
418
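/*
 * Ensure the transaction's replicas_delta_list has room for @more bytes: grow
 * it with krealloc, falling back to the fixed-size (REPLICAS_DELTA_LIST_MAX)
 * mempool when that allocation fails.
 */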
419 static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
420                                      gfp_t gfp)
421 {
422         struct replicas_delta_list *d = trans->fs_usage_deltas;
423         unsigned new_size = d ? (d->size + more) * 2 : 128;
424         unsigned alloc_size = sizeof(*d) + new_size;
425
426         WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
427
428         if (!d || d->used + more > d->size) {
429                 d = krealloc(d, alloc_size, gfp|__GFP_ZERO);
430
431                 if (unlikely(!d)) {
432                         if (alloc_size > REPLICAS_DELTA_LIST_MAX)
433                                 return -ENOMEM;
434
435                         d = mempool_alloc(&trans->c->replicas_delta_pool, gfp);
436                         if (!d)
437                                 return -ENOMEM;
438
439                         memset(d, 0, REPLICAS_DELTA_LIST_MAX);
440
441                         if (trans->fs_usage_deltas)
442                                 memcpy(d, trans->fs_usage_deltas,
443                                        trans->fs_usage_deltas->size + sizeof(*d));
444
445                         new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
446                         kfree(trans->fs_usage_deltas);
447                 }
448
449                 d->size = new_size;
450                 trans->fs_usage_deltas = d;
451         }
452
453         return 0;
454 }
455
456 int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
457 {
458         return allocate_dropping_locks_errcode(trans,
459                                 __replicas_deltas_realloc(trans, more, _gfp));
460 }
461
462 int bch2_update_replicas_list(struct btree_trans *trans,
463                          struct bch_replicas_entry_v1 *r,
464                          s64 sectors)
465 {
466         struct replicas_delta_list *d;
467         struct replicas_delta *n;
468         unsigned b;
469         int ret;
470
471         if (!sectors)
472                 return 0;
473
474         b = replicas_entry_bytes(r) + 8;
475         ret = bch2_replicas_deltas_realloc(trans, b);
476         if (ret)
477                 return ret;
478
479         d = trans->fs_usage_deltas;
480         n = (void *) d->d + d->used;
481         n->delta = sectors;
482         unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
483                       r, replicas_entry_bytes(r),
484                       "flexible array member embedded in struct with padding");
485         bch2_replicas_entry_sort(&n->r);
486         d->used += b;
487         return 0;
488 }
489
490 int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
491 {
492         struct bch_replicas_padded r;
493
494         bch2_replicas_entry_cached(&r.e, dev);
495
496         return bch2_update_replicas_list(trans, &r.e, sectors);
497 }
498
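/*
 * Trigger for alloc keys: keep the in-memory bucket state and device usage in
 * sync with the alloc btree, decide whether a newly emptied bucket must wait
 * for a journal flush before it can be reused, and kick the freelist wait
 * queue, discard, invalidate and gc_gens paths when the new state calls for
 * it.
 */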
499 int bch2_mark_alloc(struct btree_trans *trans,
500                     enum btree_id btree, unsigned level,
501                     struct bkey_s_c old, struct bkey_s_c new,
502                     unsigned flags)
503 {
504         bool gc = flags & BTREE_TRIGGER_GC;
505         u64 journal_seq = trans->journal_res.seq;
506         u64 bucket_journal_seq;
507         struct bch_fs *c = trans->c;
508         struct bch_alloc_v4 old_a_convert, new_a_convert;
509         const struct bch_alloc_v4 *old_a, *new_a;
510         struct bch_dev *ca;
511         int ret = 0;
512
513         /*
514          * alloc btree is read in by bch2_alloc_read, not gc:
515          */
516         if ((flags & BTREE_TRIGGER_GC) &&
517             !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
518                 return 0;
519
520         if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
521                                        "alloc key for invalid device or bucket"))
522                 return -EIO;
523
524         ca = bch_dev_bkey_exists(c, new.k->p.inode);
525
526         old_a = bch2_alloc_to_v4(old, &old_a_convert);
527         new_a = bch2_alloc_to_v4(new, &new_a_convert);
528
529         bucket_journal_seq = new_a->journal_seq;
530
531         if ((flags & BTREE_TRIGGER_INSERT) &&
532             data_type_is_empty(old_a->data_type) !=
533             data_type_is_empty(new_a->data_type) &&
534             new.k->type == KEY_TYPE_alloc_v4) {
535                 struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
536
537                 EBUG_ON(!journal_seq);
538
539                 /*
540                  * If the btree updates referring to a bucket weren't flushed
541                  * before the bucket became empty again, then we don't have
542                  * to wait on a journal flush before we can reuse the bucket:
543                  */
544                 v->journal_seq = bucket_journal_seq =
545                         data_type_is_empty(new_a->data_type) &&
546                         (journal_seq == v->journal_seq ||
547                          bch2_journal_noflush_seq(&c->journal, v->journal_seq))
548                         ? 0 : journal_seq;
549         }
550
551         if (!data_type_is_empty(old_a->data_type) &&
552             data_type_is_empty(new_a->data_type) &&
553             bucket_journal_seq) {
554                 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
555                                 c->journal.flushed_seq_ondisk,
556                                 new.k->p.inode, new.k->p.offset,
557                                 bucket_journal_seq);
558                 if (ret) {
559                         bch2_fs_fatal_error(c,
560                                 "error setting bucket_needs_journal_commit: %i", ret);
561                         return ret;
562                 }
563         }
564
565         percpu_down_read(&c->mark_lock);
566         if (!gc && new_a->gen != old_a->gen)
567                 *bucket_gen(ca, new.k->p.offset) = new_a->gen;
568
569         bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
570
571         if (gc) {
572                 struct bucket *g = gc_bucket(ca, new.k->p.offset);
573
574                 bucket_lock(g);
575
576                 g->gen_valid            = 1;
577                 g->gen                  = new_a->gen;
578                 g->data_type            = new_a->data_type;
579                 g->stripe               = new_a->stripe;
580                 g->stripe_redundancy    = new_a->stripe_redundancy;
581                 g->dirty_sectors        = new_a->dirty_sectors;
582                 g->cached_sectors       = new_a->cached_sectors;
583
584                 bucket_unlock(g);
585         }
586         percpu_up_read(&c->mark_lock);
587
588         if (new_a->data_type == BCH_DATA_free &&
589             (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
590                 closure_wake_up(&c->freelist_wait);
591
592         if (new_a->data_type == BCH_DATA_need_discard &&
593             (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
594                 bch2_do_discards(c);
595
596         if (old_a->data_type != BCH_DATA_cached &&
597             new_a->data_type == BCH_DATA_cached &&
598             should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
599                 bch2_do_invalidates(c);
600
601         if (new_a->data_type == BCH_DATA_need_gc_gens)
602                 bch2_do_gc_gens(c);
603
604         return 0;
605 }
606
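/*
 * GC-only marking of superblock and journal buckets: set the gc bucket's data
 * type, add @sectors to its dirty sector count, and fold the change into
 * device usage.
 */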
607 int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
608                               size_t b, enum bch_data_type data_type,
609                               unsigned sectors, struct gc_pos pos,
610                               unsigned flags)
611 {
612         struct bucket old, new, *g;
613         int ret = 0;
614
615         BUG_ON(!(flags & BTREE_TRIGGER_GC));
616         BUG_ON(data_type != BCH_DATA_sb &&
617                data_type != BCH_DATA_journal);
618
619         /*
620          * Backup superblock might be past the end of our normal usable space:
621          */
622         if (b >= ca->mi.nbuckets)
623                 return 0;
624
625         percpu_down_read(&c->mark_lock);
626         g = gc_bucket(ca, b);
627
628         bucket_lock(g);
629         old = *g;
630
631         if (bch2_fs_inconsistent_on(g->data_type &&
632                         g->data_type != data_type, c,
633                         "different types of data in same bucket: %s, %s",
634                         bch2_data_types[g->data_type],
635                         bch2_data_types[data_type])) {
636                 ret = -EIO;
637                 goto err;
638         }
639
640         if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
641                         "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
642                         ca->dev_idx, b, g->gen,
643                         bch2_data_types[g->data_type ?: data_type],
644                         g->dirty_sectors, sectors)) {
645                 ret = -EIO;
646                 goto err;
647         }
648
649         g->data_type = data_type;
650         g->dirty_sectors += sectors;
651         new = *g;
652 err:
653         bucket_unlock(g);
654         if (!ret)
655                 bch2_dev_usage_update_m(c, ca, old, new);
656         percpu_up_read(&c->mark_lock);
657         return ret;
658 }
659
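/*
 * Validate a pointer against the bucket it points into: the pointer's gen may
 * not be newer than the bucket's or too stale, the data types must be
 * compatible, and the sector count may not overflow.  Returns 1 for a stale
 * cached pointer, -EIO (after dumping the transaction's updates) on
 * inconsistency.
 */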
660 static int check_bucket_ref(struct btree_trans *trans,
661                             struct bkey_s_c k,
662                             const struct bch_extent_ptr *ptr,
663                             s64 sectors, enum bch_data_type ptr_data_type,
664                             u8 b_gen, u8 bucket_data_type,
665                             u32 bucket_sectors)
666 {
667         struct bch_fs *c = trans->c;
668         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
669         size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
670         struct printbuf buf = PRINTBUF;
671         int ret = 0;
672
673         if (bucket_data_type == BCH_DATA_cached)
674                 bucket_data_type = BCH_DATA_user;
675
676         if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) ||
677             (bucket_data_type == BCH_DATA_user   && ptr_data_type == BCH_DATA_stripe))
678                 bucket_data_type = ptr_data_type = BCH_DATA_stripe;
679
680         if (gen_after(ptr->gen, b_gen)) {
681                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
682                               BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
683                         "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
684                         "while marking %s",
685                         ptr->dev, bucket_nr, b_gen,
686                         bch2_data_types[bucket_data_type ?: ptr_data_type],
687                         ptr->gen,
688                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
689                 ret = -EIO;
690                 goto err;
691         }
692
693         if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
694                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
695                               BCH_FSCK_ERR_ptr_too_stale,
696                         "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
697                         "while marking %s",
698                         ptr->dev, bucket_nr, b_gen,
699                         bch2_data_types[bucket_data_type ?: ptr_data_type],
700                         ptr->gen,
701                         (printbuf_reset(&buf),
702                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
703                 ret = -EIO;
704                 goto err;
705         }
706
707         if (b_gen != ptr->gen && !ptr->cached) {
708                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
709                               BCH_FSCK_ERR_stale_dirty_ptr,
710                         "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
711                         "while marking %s",
712                         ptr->dev, bucket_nr, b_gen,
713                         *bucket_gen(ca, bucket_nr),
714                         bch2_data_types[bucket_data_type ?: ptr_data_type],
715                         ptr->gen,
716                         (printbuf_reset(&buf),
717                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
718                 ret = -EIO;
719                 goto err;
720         }
721
722         if (b_gen != ptr->gen) {
723                 ret = 1;
724                 goto out;
725         }
726
727         if (!data_type_is_empty(bucket_data_type) &&
728             ptr_data_type &&
729             bucket_data_type != ptr_data_type) {
730                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
731                               BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
732                         "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
733                         "while marking %s",
734                         ptr->dev, bucket_nr, b_gen,
735                         bch2_data_types[bucket_data_type],
736                         bch2_data_types[ptr_data_type],
737                         (printbuf_reset(&buf),
738                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
739                 ret = -EIO;
740                 goto err;
741         }
742
743         if ((u64) bucket_sectors + sectors > U32_MAX) {
744                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
745                               BCH_FSCK_ERR_bucket_sector_count_overflow,
746                         "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
747                         "while marking %s",
748                         ptr->dev, bucket_nr, b_gen,
749                         bch2_data_types[bucket_data_type ?: ptr_data_type],
750                         bucket_sectors, sectors,
751                         (printbuf_reset(&buf),
752                          bch2_bkey_val_to_text(&buf, c, k), buf.buf));
753                 ret = -EIO;
754                 goto err;
755         }
756 out:
757         printbuf_exit(&buf);
758         return ret;
759 err:
760         bch2_dump_trans_updates(trans);
761         goto out;
762 }
763
764 static int mark_stripe_bucket(struct btree_trans *trans,
765                               struct bkey_s_c k,
766                               unsigned ptr_idx,
767                               unsigned flags)
768 {
769         struct bch_fs *c = trans->c;
770         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
771         unsigned nr_data = s->nr_blocks - s->nr_redundant;
772         bool parity = ptr_idx >= nr_data;
773         enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
774         s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
775         const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
776         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
777         struct bucket old, new, *g;
778         struct printbuf buf = PRINTBUF;
779         int ret = 0;
780
781         BUG_ON(!(flags & BTREE_TRIGGER_GC));
782
783         /* XXX doesn't handle deletion */
784
785         percpu_down_read(&c->mark_lock);
786         g = PTR_GC_BUCKET(ca, ptr);
787
788         if (g->dirty_sectors ||
789             (g->stripe && g->stripe != k.k->p.offset)) {
790                 bch2_fs_inconsistent(c,
791                               "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
792                               ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
793                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
794                 ret = -EINVAL;
795                 goto err;
796         }
797
798         bucket_lock(g);
799         old = *g;
800
801         ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
802                                g->gen, g->data_type,
803                                g->dirty_sectors);
804         if (ret)
805                 goto err;
806
807         g->data_type = data_type;
808         g->dirty_sectors += sectors;
809
810         g->stripe               = k.k->p.offset;
811         g->stripe_redundancy    = s->nr_redundant;
812         new = *g;
813 err:
814         bucket_unlock(g);
815         if (!ret)
816                 bch2_dev_usage_update_m(c, ca, old, new);
817         percpu_up_read(&c->mark_lock);
818         printbuf_exit(&buf);
819         return ret;
820 }
821
822 static int __mark_pointer(struct btree_trans *trans,
823                           struct bkey_s_c k,
824                           const struct bch_extent_ptr *ptr,
825                           s64 sectors, enum bch_data_type ptr_data_type,
826                           u8 bucket_gen, u8 *bucket_data_type,
827                           u32 *dirty_sectors, u32 *cached_sectors)
828 {
829         u32 *dst_sectors = !ptr->cached
830                 ? dirty_sectors
831                 : cached_sectors;
832         int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
833                                    bucket_gen, *bucket_data_type, *dst_sectors);
834
835         if (ret)
836                 return ret;
837
838         *dst_sectors += sectors;
839
840         if (!*dirty_sectors && !*cached_sectors)
841                 *bucket_data_type = 0;
842         else if (*bucket_data_type != BCH_DATA_stripe)
843                 *bucket_data_type = ptr_data_type;
844
845         return 0;
846 }
847
848 static int bch2_mark_pointer(struct btree_trans *trans,
849                              enum btree_id btree_id, unsigned level,
850                              struct bkey_s_c k,
851                              struct extent_ptr_decoded p,
852                              s64 sectors,
853                              unsigned flags)
854 {
855         struct bch_fs *c = trans->c;
856         struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
857         struct bucket old, new, *g;
858         enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
859         u8 bucket_data_type;
860         int ret = 0;
861
862         BUG_ON(!(flags & BTREE_TRIGGER_GC));
863
864         percpu_down_read(&c->mark_lock);
865         g = PTR_GC_BUCKET(ca, &p.ptr);
866         bucket_lock(g);
867         old = *g;
868
869         bucket_data_type = g->data_type;
870         ret = __mark_pointer(trans, k, &p.ptr, sectors,
871                              data_type, g->gen,
872                              &bucket_data_type,
873                              &g->dirty_sectors,
874                              &g->cached_sectors);
875         if (!ret)
876                 g->data_type = bucket_data_type;
877
878         new = *g;
879         bucket_unlock(g);
880         if (!ret)
881                 bch2_dev_usage_update_m(c, ca, old, new);
882         percpu_up_read(&c->mark_lock);
883
884         return ret;
885 }
886
887 static int bch2_mark_stripe_ptr(struct btree_trans *trans,
888                                 struct bkey_s_c k,
889                                 struct bch_extent_stripe_ptr p,
890                                 enum bch_data_type data_type,
891                                 s64 sectors,
892                                 unsigned flags)
893 {
894         struct bch_fs *c = trans->c;
895         struct bch_replicas_padded r;
896         struct gc_stripe *m;
897
898         BUG_ON(!(flags & BTREE_TRIGGER_GC));
899
900         m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
901         if (!m) {
902                 bch_err(c, "error allocating memory for gc_stripes, idx %llu",
903                         (u64) p.idx);
904                 return -BCH_ERR_ENOMEM_mark_stripe_ptr;
905         }
906
907         mutex_lock(&c->ec_stripes_heap_lock);
908
909         if (!m || !m->alive) {
910                 mutex_unlock(&c->ec_stripes_heap_lock);
911                 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
912                                     (u64) p.idx);
913                 bch2_inconsistent_error(c);
914                 return -EIO;
915         }
916
917         m->block_sectors[p.block] += sectors;
918
919         r = m->r;
920         mutex_unlock(&c->ec_stripes_heap_lock);
921
922         r.e.data_type = data_type;
923         update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
924
925         return 0;
926 }
927
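/*
 * GC-path extent trigger: mark each pointer's bucket, account cached pointers
 * separately, fold erasure-coded pointers into their stripe, and credit the
 * remaining dirty sectors to the extent's replicas entry.
 */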
928 static int __mark_extent(struct btree_trans *trans,
929                          enum btree_id btree_id, unsigned level,
930                          struct bkey_s_c k, unsigned flags)
931 {
932         u64 journal_seq = trans->journal_res.seq;
933         struct bch_fs *c = trans->c;
934         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
935         const union bch_extent_entry *entry;
936         struct extent_ptr_decoded p;
937         struct bch_replicas_padded r;
938         enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
939                 ? BCH_DATA_btree
940                 : BCH_DATA_user;
941         s64 sectors = bkey_is_btree_ptr(k.k)
942                 ? btree_sectors(c)
943                 : k.k->size;
944         s64 dirty_sectors = 0;
945         bool stale;
946         int ret;
947
948         BUG_ON(!(flags & BTREE_TRIGGER_GC));
949
950         r.e.data_type   = data_type;
951         r.e.nr_devs     = 0;
952         r.e.nr_required = 1;
953
954         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
955                 s64 disk_sectors = ptr_disk_sectors(sectors, p);
956
957                 if (flags & BTREE_TRIGGER_OVERWRITE)
958                         disk_sectors = -disk_sectors;
959
960                 ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
961                 if (ret < 0)
962                         return ret;
963
964                 stale = ret > 0;
965
966                 if (p.ptr.cached) {
967                         if (!stale) {
968                                 ret = update_cached_sectors(c, k, p.ptr.dev,
969                                                 disk_sectors, journal_seq, true);
970                                 if (ret) {
971                                         bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
972                                                             __func__);
973                                         return ret;
974                                 }
975                         }
976                 } else if (!p.has_ec) {
977                         dirty_sectors          += disk_sectors;
978                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
979                 } else {
980                         ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
981                                         disk_sectors, flags);
982                         if (ret)
983                                 return ret;
984
985                         /*
986                          * There may be other dirty pointers in this extent, but
987                          * if so they're not required for mounting if we have an
988                          * erasure coded pointer in this extent:
989                          */
990                         r.e.nr_required = 0;
991                 }
992         }
993
994         if (r.e.nr_devs) {
995                 ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
996                 if (ret) {
997                         struct printbuf buf = PRINTBUF;
998
999                         bch2_bkey_val_to_text(&buf, c, k);
1000                         bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
1001                         printbuf_exit(&buf);
1002                         return ret;
1003                 }
1004         }
1005
1006         return 0;
1007 }
1008
1009 int bch2_mark_extent(struct btree_trans *trans,
1010                      enum btree_id btree_id, unsigned level,
1011                      struct bkey_s_c old, struct bkey_s_c new,
1012                      unsigned flags)
1013 {
1014         return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
1015 }
1016
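/*
 * Trigger for stripe keys: outside of gc, keep the stripes radix tree and
 * stripes heap in sync with the key; during gc, rebuild the gc_stripe entry,
 * mark each constituent bucket and credit the parity sectors to the stripe's
 * replicas entry.
 */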
1017 int bch2_mark_stripe(struct btree_trans *trans,
1018                      enum btree_id btree_id, unsigned level,
1019                      struct bkey_s_c old, struct bkey_s_c new,
1020                      unsigned flags)
1021 {
1022         bool gc = flags & BTREE_TRIGGER_GC;
1023         u64 journal_seq = trans->journal_res.seq;
1024         struct bch_fs *c = trans->c;
1025         u64 idx = new.k->p.offset;
1026         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
1027                 ? bkey_s_c_to_stripe(old).v : NULL;
1028         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
1029                 ? bkey_s_c_to_stripe(new).v : NULL;
1030         unsigned i;
1031         int ret;
1032
1033         BUG_ON(gc && old_s);
1034
1035         if (!gc) {
1036                 struct stripe *m = genradix_ptr(&c->stripes, idx);
1037
1038                 if (!m) {
1039                         struct printbuf buf1 = PRINTBUF;
1040                         struct printbuf buf2 = PRINTBUF;
1041
1042                         bch2_bkey_val_to_text(&buf1, c, old);
1043                         bch2_bkey_val_to_text(&buf2, c, new);
1044                         bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
1045                                             "old %s\n"
1046                                             "new %s", idx, buf1.buf, buf2.buf);
1047                         printbuf_exit(&buf2);
1048                         printbuf_exit(&buf1);
1049                         bch2_inconsistent_error(c);
1050                         return -1;
1051                 }
1052
1053                 if (!new_s) {
1054                         bch2_stripes_heap_del(c, m, idx);
1055
1056                         memset(m, 0, sizeof(*m));
1057                 } else {
1058                         m->sectors      = le16_to_cpu(new_s->sectors);
1059                         m->algorithm    = new_s->algorithm;
1060                         m->nr_blocks    = new_s->nr_blocks;
1061                         m->nr_redundant = new_s->nr_redundant;
1062                         m->blocks_nonempty = 0;
1063
1064                         for (i = 0; i < new_s->nr_blocks; i++)
1065                                 m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
1066
1067                         if (!old_s)
1068                                 bch2_stripes_heap_insert(c, m, idx);
1069                         else
1070                                 bch2_stripes_heap_update(c, m, idx);
1071                 }
1072         } else {
1073                 struct gc_stripe *m =
1074                         genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
1075
1076                 if (!m) {
1077                         bch_err(c, "error allocating memory for gc_stripes, idx %llu",
1078                                 idx);
1079                         return -BCH_ERR_ENOMEM_mark_stripe;
1080                 }
1081                 /*
1082                  * This will be wrong when we bring back runtime gc: we should
1083                  * be unmarking the old key and then marking the new key
1084                  */
1085                 m->alive        = true;
1086                 m->sectors      = le16_to_cpu(new_s->sectors);
1087                 m->nr_blocks    = new_s->nr_blocks;
1088                 m->nr_redundant = new_s->nr_redundant;
1089
1090                 for (i = 0; i < new_s->nr_blocks; i++)
1091                         m->ptrs[i] = new_s->ptrs[i];
1092
1093                 bch2_bkey_to_replicas(&m->r.e, new);
1094
1095                 /*
1096                  * gc recalculates this field from stripe ptr
1097                  * references:
1098                  */
1099                 memset(m->block_sectors, 0, sizeof(m->block_sectors));
1100
1101                 for (i = 0; i < new_s->nr_blocks; i++) {
1102                         ret = mark_stripe_bucket(trans, new, i, flags);
1103                         if (ret)
1104                                 return ret;
1105                 }
1106
1107                 ret = update_replicas(c, new, &m->r.e,
1108                                       ((s64) m->sectors * m->nr_redundant),
1109                                       journal_seq, gc);
1110                 if (ret) {
1111                         struct printbuf buf = PRINTBUF;
1112
1113                         bch2_bkey_val_to_text(&buf, c, new);
1114                         bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
1115                         printbuf_exit(&buf);
1116                         return ret;
1117                 }
1118         }
1119
1120         return 0;
1121 }
1122
1123 static int __mark_reservation(struct btree_trans *trans,
1124                               enum btree_id btree_id, unsigned level,
1125                               struct bkey_s_c k, unsigned flags)
1126 {
1127         struct bch_fs *c = trans->c;
1128         struct bch_fs_usage *fs_usage;
1129         unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1130         s64 sectors = (s64) k.k->size;
1131
1132         BUG_ON(!(flags & BTREE_TRIGGER_GC));
1133
1134         if (flags & BTREE_TRIGGER_OVERWRITE)
1135                 sectors = -sectors;
1136         sectors *= replicas;
1137
1138         percpu_down_read(&c->mark_lock);
1139         preempt_disable();
1140
1141         fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
1142         replicas = clamp_t(unsigned, replicas, 1,
1143                            ARRAY_SIZE(fs_usage->persistent_reserved));
1144
1145         fs_usage->reserved                              += sectors;
1146         fs_usage->persistent_reserved[replicas - 1]     += sectors;
1147
1148         preempt_enable();
1149         percpu_up_read(&c->mark_lock);
1150
1151         return 0;
1152 }
1153
1154 int bch2_mark_reservation(struct btree_trans *trans,
1155                           enum btree_id btree_id, unsigned level,
1156                           struct bkey_s_c old, struct bkey_s_c new,
1157                           unsigned flags)
1158 {
1159         return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
1160 }
1161
1162 void bch2_trans_fs_usage_revert(struct btree_trans *trans,
1163                                 struct replicas_delta_list *deltas)
1164 {
1165         struct bch_fs *c = trans->c;
1166         struct bch_fs_usage *dst;
1167         struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
1168         s64 added = 0;
1169         unsigned i;
1170
1171         percpu_down_read(&c->mark_lock);
1172         preempt_disable();
1173         dst = fs_usage_ptr(c, trans->journal_res.seq, false);
1174
1175         /* revert changes: */
1176         for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
1177                 switch (d->r.data_type) {
1178                 case BCH_DATA_btree:
1179                 case BCH_DATA_user:
1180                 case BCH_DATA_parity:
1181                         added += d->delta;
1182                 }
1183                 BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
1184         }
1185
1186         dst->nr_inodes -= deltas->nr_inodes;
1187
1188         for (i = 0; i < BCH_REPLICAS_MAX; i++) {
1189                 added                           -= deltas->persistent_reserved[i];
1190                 dst->reserved                   -= deltas->persistent_reserved[i];
1191                 dst->persistent_reserved[i]     -= deltas->persistent_reserved[i];
1192         }
1193
1194         if (added > 0) {
1195                 trans->disk_res->sectors += added;
1196                 this_cpu_add(*c->online_reserved, added);
1197         }
1198
1199         preempt_enable();
1200         percpu_up_read(&c->mark_lock);
1201 }
1202
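/*
 * At commit time, apply a transaction's accumulated replicas deltas to
 * filesystem usage.  If usage grew by more than the disk reservation covered,
 * clamp sectors_available and flag an inconsistency; if a replicas entry is
 * missing, revert what was applied so far and return -1.
 */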
1203 int bch2_trans_fs_usage_apply(struct btree_trans *trans,
1204                               struct replicas_delta_list *deltas)
1205 {
1206         struct bch_fs *c = trans->c;
1207         static int warned_disk_usage = 0;
1208         bool warn = false;
1209         u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
1210         struct replicas_delta *d, *d2;
1211         struct replicas_delta *top = (void *) deltas->d + deltas->used;
1212         struct bch_fs_usage *dst;
1213         s64 added = 0, should_not_have_added;
1214         unsigned i;
1215
1216         percpu_down_read(&c->mark_lock);
1217         preempt_disable();
1218         dst = fs_usage_ptr(c, trans->journal_res.seq, false);
1219
1220         for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
1221                 switch (d->r.data_type) {
1222                 case BCH_DATA_btree:
1223                 case BCH_DATA_user:
1224                 case BCH_DATA_parity:
1225                         added += d->delta;
1226                 }
1227
1228                 if (__update_replicas(c, dst, &d->r, d->delta))
1229                         goto need_mark;
1230         }
1231
1232         dst->nr_inodes += deltas->nr_inodes;
1233
1234         for (i = 0; i < BCH_REPLICAS_MAX; i++) {
1235                 added                           += deltas->persistent_reserved[i];
1236                 dst->reserved                   += deltas->persistent_reserved[i];
1237                 dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
1238         }
1239
1240         /*
1241          * Not allowed to reduce sectors_available except by getting a
1242          * reservation:
1243          */
1244         should_not_have_added = added - (s64) disk_res_sectors;
1245         if (unlikely(should_not_have_added > 0)) {
1246                 u64 old, new, v = atomic64_read(&c->sectors_available);
1247
1248                 do {
1249                         old = v;
1250                         new = max_t(s64, 0, old - should_not_have_added);
1251                 } while ((v = atomic64_cmpxchg(&c->sectors_available,
1252                                                old, new)) != old);
1253
1254                 added -= should_not_have_added;
1255                 warn = true;
1256         }
1257
1258         if (added > 0) {
1259                 trans->disk_res->sectors -= added;
1260                 this_cpu_sub(*c->online_reserved, added);
1261         }
1262
1263         preempt_enable();
1264         percpu_up_read(&c->mark_lock);
1265
1266         if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
1267                 bch2_trans_inconsistent(trans,
1268                                         "disk usage increased %lli more than %llu sectors reserved",
1269                                         should_not_have_added, disk_res_sectors);
1270         return 0;
1271 need_mark:
1272         /* revert changes: */
1273         for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
1274                 BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
1275
1276         preempt_enable();
1277         percpu_up_read(&c->mark_lock);
1278         return -1;
1279 }
1280
1281 /* trans_mark: */
1282
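/*
 * Transactional counterpart to marking an extent pointer: update the alloc
 * key for the bucket the pointer lands in, and add or remove the matching
 * backpointer for non-cached pointers.
 */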
1283 static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
1284                                    enum btree_id btree_id, unsigned level,
1285                                    struct bkey_s_c k, struct extent_ptr_decoded p,
1286                                    unsigned flags)
1287 {
1288         bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
1289         struct btree_iter iter;
1290         struct bkey_i_alloc_v4 *a;
1291         struct bpos bucket;
1292         struct bch_backpointer bp;
1293         s64 sectors;
1294         int ret;
1295
1296         bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
1297         sectors = bp.bucket_len;
1298         if (!insert)
1299                 sectors = -sectors;
1300
1301         a = bch2_trans_start_alloc_update(trans, &iter, bucket);
1302         if (IS_ERR(a))
1303                 return PTR_ERR(a);
1304
1305         ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
1306                              a->v.gen, &a->v.data_type,
1307                              &a->v.dirty_sectors, &a->v.cached_sectors) ?:
1308                 bch2_trans_update(trans, &iter, &a->k_i, 0);
1309         bch2_trans_iter_exit(trans, &iter);
1310
1311         if (ret)
1312                 return ret;
1313
1314         if (!p.ptr.cached) {
1315                 ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
1316                 if (ret)
1317                         return ret;
1318         }
1319
1320         return 0;
1321 }
1322
1323 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
1324                         struct extent_ptr_decoded p,
1325                         s64 sectors, enum bch_data_type data_type)
1326 {
1327         struct btree_iter iter;
1328         struct bkey_i_stripe *s;
1329         struct bch_replicas_padded r;
1330         int ret = 0;
1331
1332         s = bch2_bkey_get_mut_typed(trans, &iter,
1333                         BTREE_ID_stripes, POS(0, p.ec.idx),
1334                         BTREE_ITER_WITH_UPDATES, stripe);
1335         ret = PTR_ERR_OR_ZERO(s);
1336         if (unlikely(ret)) {
1337                 bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
1338                         "pointer to nonexistent stripe %llu",
1339                         (u64) p.ec.idx);
1340                 goto err;
1341         }
1342
1343         if (!bch2_ptr_matches_stripe(&s->v, p)) {
1344                 bch2_trans_inconsistent(trans,
1345                         "stripe pointer doesn't match stripe %llu",
1346                         (u64) p.ec.idx);
1347                 ret = -EIO;
1348                 goto err;
1349         }
1350
1351         stripe_blockcount_set(&s->v, p.ec.block,
1352                 stripe_blockcount_get(&s->v, p.ec.block) +
1353                 sectors);
1354
1355         bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
1356         r.e.data_type = data_type;
1357         ret = bch2_update_replicas_list(trans, &r.e, sectors);
1358 err:
1359         bch2_trans_iter_exit(trans, &iter);
1360         return ret;
1361 }
1362
1363 static int __trans_mark_extent(struct btree_trans *trans,
1364                                enum btree_id btree_id, unsigned level,
1365                                struct bkey_s_c k, unsigned flags)
1366 {
1367         struct bch_fs *c = trans->c;
1368         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1369         const union bch_extent_entry *entry;
1370         struct extent_ptr_decoded p;
1371         struct bch_replicas_padded r;
1372         enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
1373                 ? BCH_DATA_btree
1374                 : BCH_DATA_user;
1375         s64 sectors = bkey_is_btree_ptr(k.k)
1376                 ? btree_sectors(c)
1377                 : k.k->size;
1378         s64 dirty_sectors = 0;
1379         bool stale;
1380         int ret = 0;
1381
1382         r.e.data_type   = data_type;
1383         r.e.nr_devs     = 0;
1384         r.e.nr_required = 1;
1385
1386         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1387                 s64 disk_sectors = ptr_disk_sectors(sectors, p);
1388
1389                 if (flags & BTREE_TRIGGER_OVERWRITE)
1390                         disk_sectors = -disk_sectors;
1391
1392                 ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
1393                 if (ret < 0)
1394                         return ret;
1395
1396                 stale = ret > 0;
1397
1398                 if (p.ptr.cached) {
1399                         if (!stale) {
1400                                 ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
1401                                                                       disk_sectors);
1402                                 if (ret)
1403                                         return ret;
1404                         }
1405                 } else if (!p.has_ec) {
1406                         dirty_sectors          += disk_sectors;
1407                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
1408                 } else {
1409                         ret = bch2_trans_mark_stripe_ptr(trans, p,
1410                                         disk_sectors, data_type);
1411                         if (ret)
1412                                 return ret;
1413
1414                         r.e.nr_required = 0;
1415                 }
1416         }
1417
1418         if (r.e.nr_devs)
1419                 ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
1420
1421         return ret;
1422 }
1423
1424 int bch2_trans_mark_extent(struct btree_trans *trans,
1425                            enum btree_id btree_id, unsigned level,
1426                            struct bkey_s_c old, struct bkey_i *new,
1427                            unsigned flags)
1428 {
1429         struct bch_fs *c = trans->c;
1430         int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
1431                   (int) bch2_bkey_needs_rebalance(c, old);
1432
1433         if (mod) {
1434                 int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
1435                 if (ret)
1436                         return ret;
1437         }
1438
1439         return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
1440 }
1441
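     /*
      * Update the alloc key for one block of a stripe: sanity check the
      * bucket, record which stripe it belongs to (or clear that when the
      * stripe is being deleted), and adjust dirty_sectors for parity blocks.
      */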
1442 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
1443                                          struct bkey_s_c_stripe s,
1444                                          unsigned idx, bool deleting)
1445 {
1446         struct bch_fs *c = trans->c;
1447         const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
1448         struct btree_iter iter;
1449         struct bkey_i_alloc_v4 *a;
1450         enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
1451                 ? BCH_DATA_parity : 0;
1452         s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
1453         int ret = 0;
1454
1455         if (deleting)
1456                 sectors = -sectors;
1457
1458         a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
1459         if (IS_ERR(a))
1460                 return PTR_ERR(a);
1461
1462         ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
1463                                a->v.gen, a->v.data_type,
1464                                a->v.dirty_sectors);
1465         if (ret)
1466                 goto err;
1467
1468         if (!deleting) {
1469                 if (bch2_trans_inconsistent_on(a->v.stripe ||
1470                                                a->v.stripe_redundancy, trans,
1471                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
1472                                 iter.pos.inode, iter.pos.offset, a->v.gen,
1473                                 bch2_data_types[a->v.data_type],
1474                                 a->v.dirty_sectors,
1475                                 a->v.stripe, s.k->p.offset)) {
1476                         ret = -EIO;
1477                         goto err;
1478                 }
1479
1480                 if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
1481                                 "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
1482                                 iter.pos.inode, iter.pos.offset, a->v.gen,
1483                                 bch2_data_types[a->v.data_type],
1484                                 a->v.dirty_sectors,
1485                                 s.k->p.offset)) {
1486                         ret = -EIO;
1487                         goto err;
1488                 }
1489
1490                 a->v.stripe             = s.k->p.offset;
1491                 a->v.stripe_redundancy  = s.v->nr_redundant;
1492                 a->v.data_type          = BCH_DATA_stripe;
1493         } else {
1494                 if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
1495                                                a->v.stripe_redundancy != s.v->nr_redundant, trans,
1496                                 "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
1497                                 iter.pos.inode, iter.pos.offset, a->v.gen,
1498                                 s.k->p.offset, a->v.stripe)) {
1499                         ret = -EIO;
1500                         goto err;
1501                 }
1502
1503                 a->v.stripe             = 0;
1504                 a->v.stripe_redundancy  = 0;
1505                 a->v.data_type          = alloc_data_type(a->v, BCH_DATA_user);
1506         }
1507
1508         a->v.dirty_sectors += sectors;
1509         if (data_type)
1510                 a->v.data_type = !deleting ? data_type : 0;
1511
1512         ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1515 err:
1516         bch2_trans_iter_exit(trans, &iter);
1517         return ret;
1518 }
1519
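     /*
      * Transactional trigger for stripe keys: update replicas accounting for
      * the old and new stripe, then update the alloc key of every bucket
      * whose pointer changed.
      */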
1520 int bch2_trans_mark_stripe(struct btree_trans *trans,
1521                            enum btree_id btree_id, unsigned level,
1522                            struct bkey_s_c old, struct bkey_i *new,
1523                            unsigned flags)
1524 {
1525         const struct bch_stripe *old_s = NULL;
1526         struct bch_stripe *new_s = NULL;
1527         struct bch_replicas_padded r;
1528         unsigned i, nr_blocks;
1529         int ret = 0;
1530
1531         if (old.k->type == KEY_TYPE_stripe)
1532                 old_s = bkey_s_c_to_stripe(old).v;
1533         if (new->k.type == KEY_TYPE_stripe)
1534                 new_s = &bkey_i_to_stripe(new)->v;
1535
1536         /*
1537          * If the pointers aren't changing, we don't need to do anything:
1538          */
1539         if (new_s && old_s &&
1540             new_s->nr_blocks    == old_s->nr_blocks &&
1541             new_s->nr_redundant == old_s->nr_redundant &&
1542             !memcmp(old_s->ptrs, new_s->ptrs,
1543                     new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
1544                 return 0;
1545
1546         BUG_ON(new_s && old_s &&
1547                (new_s->nr_blocks        != old_s->nr_blocks ||
1548                 new_s->nr_redundant     != old_s->nr_redundant));
1549
1550         nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
1551
1552         if (new_s) {
1553                 s64 sectors = le16_to_cpu(new_s->sectors);
1554
1555                 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
1556                 ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
1557                 if (ret)
1558                         return ret;
1559         }
1560
1561         if (old_s) {
1562                 s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
1563
1564                 bch2_bkey_to_replicas(&r.e, old);
1565                 ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
1566                 if (ret)
1567                         return ret;
1568         }
1569
1570         for (i = 0; i < nr_blocks; i++) {
1571                 if (new_s && old_s &&
1572                     !memcmp(&new_s->ptrs[i],
1573                             &old_s->ptrs[i],
1574                             sizeof(new_s->ptrs[i])))
1575                         continue;
1576
1577                 if (new_s) {
1578                         ret = bch2_trans_mark_stripe_bucket(trans,
1579                                         bkey_i_to_s_c_stripe(new), i, false);
1580                         if (ret)
1581                                 break;
1582                 }
1583
1584                 if (old_s) {
1585                         ret = bch2_trans_mark_stripe_bucket(trans,
1586                                         bkey_s_c_to_stripe(old), i, true);
1587                         if (ret)
1588                                 break;
1589                 }
1590         }
1591
1592         return ret;
1593 }
1594
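     /*
      * Transactional trigger for reservation keys: account size * nr_replicas
      * sectors against the persistent_reserved counters in the transaction's
      * replicas delta list.
      */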
1595 static int __trans_mark_reservation(struct btree_trans *trans,
1596                                     enum btree_id btree_id, unsigned level,
1597                                     struct bkey_s_c k, unsigned flags)
1598 {
1599         unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1600         s64 sectors = (s64) k.k->size;
1601         struct replicas_delta_list *d;
1602         int ret;
1603
1604         if (flags & BTREE_TRIGGER_OVERWRITE)
1605                 sectors = -sectors;
1606         sectors *= replicas;
1607
1608         ret = bch2_replicas_deltas_realloc(trans, 0);
1609         if (ret)
1610                 return ret;
1611
1612         d = trans->fs_usage_deltas;
1613         replicas = clamp_t(unsigned, replicas, 1,
1614                            ARRAY_SIZE(d->persistent_reserved));
1615
1616         d->persistent_reserved[replicas - 1] += sectors;
1617         return 0;
1618 }
1619
1620 int bch2_trans_mark_reservation(struct btree_trans *trans,
1621                                 enum btree_id btree_id, unsigned level,
1622                                 struct bkey_s_c old,
1623                                 struct bkey_i *new,
1624                                 unsigned flags)
1625 {
1626         return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
1627 }
1628
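     /*
      * Mark a bucket as holding metadata (superblock or journal): report a
      * fsck error if the bucket already holds a different type of data,
      * otherwise set the bucket's data type and sector count directly.
      */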
1629 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1630                                     struct bch_dev *ca, size_t b,
1631                                     enum bch_data_type type,
1632                                     unsigned sectors)
1633 {
1634         struct bch_fs *c = trans->c;
1635         struct btree_iter iter;
1636         struct bkey_i_alloc_v4 *a;
1637         int ret = 0;
1638
1639         /*
1640          * Backup superblock might be past the end of our normal usable space:
1641          */
1642         if (b >= ca->mi.nbuckets)
1643                 return 0;
1644
1645         a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b));
1646         if (IS_ERR(a))
1647                 return PTR_ERR(a);
1648
1649         if (a->v.data_type && type && a->v.data_type != type) {
1650                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
1651                               BCH_FSCK_ERR_bucket_metadata_type_mismatch,
1652                         "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
1653                         "while marking %s",
1654                         iter.pos.inode, iter.pos.offset, a->v.gen,
1655                         bch2_data_types[a->v.data_type],
1656                         bch2_data_types[type],
1657                         bch2_data_types[type]);
1658                 ret = -EIO;
1659                 goto err;
1660         }
1661
1662         if (a->v.data_type      != type ||
1663             a->v.dirty_sectors  != sectors) {
1664                 a->v.data_type          = type;
1665                 a->v.dirty_sectors      = sectors;
1666                 ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
1667         }
1668 err:
1669         bch2_trans_iter_exit(trans, &iter);
1670         return ret;
1671 }
1672
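     /*
      * As above, but runs the update in its own transaction commit (with
      * restarts handled by commit_do()).
      */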
1673 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1674                                     struct bch_dev *ca, size_t b,
1675                                     enum bch_data_type type,
1676                                     unsigned sectors)
1677 {
1678         return commit_do(trans, NULL, NULL, 0,
1679                         __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
1680 }
1681
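     /*
      * Walk a range of sectors, accumulating sectors per bucket: each time
      * the walk crosses into a new bucket, the previous one is marked; the
      * caller is responsible for flushing the final bucket.
      */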
1682 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
1683                                             struct bch_dev *ca,
1684                                             u64 start, u64 end,
1685                                             enum bch_data_type type,
1686                                             u64 *bucket, unsigned *bucket_sectors)
1687 {
1688         do {
1689                 u64 b = sector_to_bucket(ca, start);
1690                 unsigned sectors =
1691                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
1692
1693                 if (b != *bucket && *bucket_sectors) {
1694                         int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
1695                                                                   type, *bucket_sectors);
1696                         if (ret)
1697                                 return ret;
1698
1699                         *bucket_sectors = 0;
1700                 }
1701
1702                 *bucket         = b;
1703                 *bucket_sectors += sectors;
1704                 start += sectors;
1705         } while (start < end);
1706
1707         return 0;
1708 }
1709
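     /*
      * Mark the buckets containing this device's superblocks (including the
      * sectors preceding the default superblock offset) and its journal
      * buckets.
      */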
1710 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
1711                                     struct bch_dev *ca)
1712 {
1713         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
1714         u64 bucket = 0;
1715         unsigned i, bucket_sectors = 0;
1716         int ret;
1717
1718         for (i = 0; i < layout->nr_superblocks; i++) {
1719                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
1720
1721                 if (offset == BCH_SB_SECTOR) {
1722                         ret = bch2_trans_mark_metadata_sectors(trans, ca,
1723                                                 0, BCH_SB_SECTOR,
1724                                                 BCH_DATA_sb, &bucket, &bucket_sectors);
1725                         if (ret)
1726                                 return ret;
1727                 }
1728
1729                 ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
1730                                       offset + (1 << layout->sb_max_size_bits),
1731                                       BCH_DATA_sb, &bucket, &bucket_sectors);
1732                 if (ret)
1733                         return ret;
1734         }
1735
1736         if (bucket_sectors) {
1737                 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1738                                 bucket, BCH_DATA_sb, bucket_sectors);
1739                 if (ret)
1740                         return ret;
1741         }
1742
1743         for (i = 0; i < ca->journal.nr; i++) {
1744                 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1745                                 ca->journal.buckets[i],
1746                                 BCH_DATA_journal, ca->mi.bucket_size);
1747                 if (ret)
1748                         return ret;
1749         }
1750
1751         return 0;
1752 }
1753
1754 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
1755 {
1756         int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
1757
1758         bch_err_fn(c, ret);
1759         return ret;
1760 }
1761
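     /*
      * Mark superblock and journal buckets on every online member device,
      * dropping the iterator's ref on the device if we return early on error.
      */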
1762 int bch2_trans_mark_dev_sbs(struct bch_fs *c)
1763 {
1764         for_each_online_member(c, ca) {
1765                 int ret = bch2_trans_mark_dev_sb(c, ca);
1766                 if (ret) {
1767                         percpu_ref_put(&ca->ref);
1768                         return ret;
1769                 }
1770         }
1771
1772         return 0;
1773 }
1774
1775 /* Disk reservations: */
1776
1777 #define SECTORS_CACHE   1024
1778
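     /*
      * Reserve sectors for a write. Fast path: satisfy the request from this
      * cpu's cached sectors_available, refilling the cache (with
      * SECTORS_CACHE extra sectors) from the filesystem-wide atomic counter
      * when needed. Slow path: recompute sectors_available from filesystem
      * usage under sectors_available_lock, and fail with
      * -BCH_ERR_ENOSPC_disk_reservation unless BCH_DISK_RESERVATION_NOFAIL
      * is set.
      */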
1779 int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
1780                               u64 sectors, int flags)
1781 {
1782         struct bch_fs_pcpu *pcpu;
1783         u64 old, v, get;
1784         s64 sectors_available;
1785         int ret;
1786
1787         percpu_down_read(&c->mark_lock);
1788         preempt_disable();
1789         pcpu = this_cpu_ptr(c->pcpu);
1790
1791         if (sectors <= pcpu->sectors_available)
1792                 goto out;
1793
1794         v = atomic64_read(&c->sectors_available);
1795         do {
1796                 old = v;
1797                 get = min((u64) sectors + SECTORS_CACHE, old);
1798
1799                 if (get < sectors) {
1800                         preempt_enable();
1801                         goto recalculate;
1802                 }
1803         } while ((v = atomic64_cmpxchg(&c->sectors_available,
1804                                        old, old - get)) != old);
1805
1806         pcpu->sectors_available         += get;
1807
1808 out:
1809         pcpu->sectors_available         -= sectors;
1810         this_cpu_add(*c->online_reserved, sectors);
1811         res->sectors                    += sectors;
1812
1813         preempt_enable();
1814         percpu_up_read(&c->mark_lock);
1815         return 0;
1816
1817 recalculate:
1818         mutex_lock(&c->sectors_available_lock);
1819
1820         percpu_u64_set(&c->pcpu->sectors_available, 0);
1821         sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
1822
1823         if (sectors <= sectors_available ||
1824             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1825                 atomic64_set(&c->sectors_available,
1826                              max_t(s64, 0, sectors_available - sectors));
1827                 this_cpu_add(*c->online_reserved, sectors);
1828                 res->sectors                    += sectors;
1829                 ret = 0;
1830         } else {
1831                 atomic64_set(&c->sectors_available, sectors_available);
1832                 ret = -BCH_ERR_ENOSPC_disk_reservation;
1833         }
1834
1835         mutex_unlock(&c->sectors_available_lock);
1836         percpu_up_read(&c->mark_lock);
1837
1838         return ret;
1839 }
1840
1841 /* Startup/shutdown: */
1842
1843 static void bucket_gens_free_rcu(struct rcu_head *rcu)
1844 {
1845         struct bucket_gens *buckets =
1846                 container_of(rcu, struct bucket_gens, rcu);
1847
1848         kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
1849 }
1850
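     /*
      * Allocate a new bucket_gens array (and buckets_nouse bitmap, if
      * enabled) sized for nbuckets; when resizing, copy the old contents
      * across under the gc, bucket and mark locks, publish the new array with
      * rcu_assign_pointer(), and free the old one after an RCU grace period.
      */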
1851 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1852 {
1853         struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
1854         unsigned long *buckets_nouse = NULL;
1855         bool resize = ca->bucket_gens != NULL;
1856         int ret;
1857
1858         if (!(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
1859                                             GFP_KERNEL|__GFP_ZERO))) {
1860                 ret = -BCH_ERR_ENOMEM_bucket_gens;
1861                 goto err;
1862         }
1863
1864         if ((c->opts.buckets_nouse &&
1865              !(buckets_nouse    = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1866                                             sizeof(unsigned long),
1867                                             GFP_KERNEL|__GFP_ZERO)))) {
1868                 ret = -BCH_ERR_ENOMEM_buckets_nouse;
1869                 goto err;
1870         }
1871
1872         bucket_gens->first_bucket = ca->mi.first_bucket;
1873         bucket_gens->nbuckets   = nbuckets;
1874
1875         if (resize) {
1876                 down_write(&c->gc_lock);
1877                 down_write(&ca->bucket_lock);
1878                 percpu_down_write(&c->mark_lock);
1879         }
1880
1881         old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
1882
1883         if (resize) {
1884                 size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
1885
1886                 memcpy(bucket_gens->b,
1887                        old_bucket_gens->b,
1888                        n);
1889                 if (buckets_nouse)
1890                         memcpy(buckets_nouse,
1891                                ca->buckets_nouse,
1892                                BITS_TO_LONGS(n) * sizeof(unsigned long));
1893         }
1894
1895         rcu_assign_pointer(ca->bucket_gens, bucket_gens);
1896         bucket_gens     = old_bucket_gens;      /* old array (if any) is freed via call_rcu() below */
1897
1898         swap(ca->buckets_nouse, buckets_nouse);
1899
1900         nbuckets = ca->mi.nbuckets;
1901
1902         if (resize) {
1903                 percpu_up_write(&c->mark_lock);
1904                 up_write(&ca->bucket_lock);
1905                 up_write(&c->gc_lock);
1906         }
1907
1908         ret = 0;
1909 err:
1910         kvpfree(buckets_nouse,
1911                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1912         if (bucket_gens)
1913                 call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
1914
1915         return ret;
1916 }
1917
1918 void bch2_dev_buckets_free(struct bch_dev *ca)
1919 {
1920         unsigned i;
1921
1922         kvpfree(ca->buckets_nouse,
1923                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1924         kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
1925                 sizeof(struct bucket_gens) + ca->mi.nbuckets);
1926
1927         for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
1928                 free_percpu(ca->usage[i]);
1929         kfree(ca->usage_base);
1930 }
1931
1932 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1933 {
1934         unsigned i;
1935
1936         ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
1937         if (!ca->usage_base)
1938                 return -BCH_ERR_ENOMEM_usage_init;
1939
1940         for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
1941                 ca->usage[i] = alloc_percpu(struct bch_dev_usage);
1942                 if (!ca->usage[i])
1943                         return -BCH_ERR_ENOMEM_usage_init;
1944         }
1945
1946         return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
1947 }