1 /*
2  * Code for manipulating bucket marks for garbage collection.
3  *
4  * Copyright 2014 Datera, Inc.
5  *
6  * Bucket states:
7  * - free bucket: mark == 0
8  *   The bucket contains no data and will not be read
9  *
10  * - allocator bucket: owned_by_allocator == 1
11  *   The bucket is on a free list, or it is an open bucket
12  *
13  * - cached bucket: owned_by_allocator == 0 &&
14  *                  dirty_sectors == 0 &&
15  *                  cached_sectors > 0
16  *   The bucket contains data but may be safely discarded as there are
17  *   enough replicas of the data on other cache devices, or it has been
18  *   written back to the backing device
19  *
20  * - dirty bucket: owned_by_allocator == 0 &&
21  *                 dirty_sectors > 0
22  * The bucket contains data that we must not discard (either the only copy,
23  *   or one of the 'main copies' for data requiring multiple replicas)
24  *
25  * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
26  *   This is a btree node, journal or gen/prio bucket
27  *
28  * Lifecycle:
29  *
30  * bucket invalidated => bucket on freelist => open bucket =>
31  *     [dirty bucket =>] cached bucket => bucket invalidated => ...
32  *
33  * Note that cache promotion can skip the dirty bucket step, as data
34  * is copied from a deeper tier to a shallower tier, onto a cached
35  * bucket.
36  * Note also that a cached bucket can spontaneously become dirty --
37  * see below.
38  *
39  * Only a traversal of the key space can determine whether a bucket is
40  * truly dirty or cached.
41  *
42  * Transitions:
43  *
44  * - free => allocator: bucket was invalidated
45  * - cached => allocator: bucket was invalidated
46  *
47  * - allocator => dirty: open bucket was filled up
48  * - allocator => cached: open bucket was filled up
49  * - allocator => metadata: metadata was allocated
50  *
51  * - dirty => cached: dirty sectors were copied to a deeper tier
52  * - dirty => free: dirty sectors were overwritten or moved (copy gc)
53  * - cached => free: cached sectors were overwritten
54  *
55  * - metadata => free: metadata was freed
56  *
57  * Oddities:
58  * - cached => dirty: a device was removed so formerly replicated data
59  *                    is no longer sufficiently replicated
60  * - free => cached: cannot happen
61  * - free => dirty: cannot happen
62  * - free => metadata: cannot happen
63  */
64
65 #include "bcachefs.h"
66 #include "alloc_background.h"
67 #include "bset.h"
68 #include "btree_gc.h"
69 #include "btree_update.h"
70 #include "buckets.h"
71 #include "ec.h"
72 #include "error.h"
73 #include "movinggc.h"
74 #include "replicas.h"
75
76 #include <linux/preempt.h>
77 #include <trace/events/bcachefs.h>
78
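/*
 * Illustrative sketch, not part of the upstream file: one way to classify a
 * bucket into the states described in the comment at the top of this file,
 * using only fields of struct bucket_mark that are used elsewhere in this
 * file.  The enum and function names are made up for the example.
 */
enum example_bucket_state {
	EXAMPLE_BUCKET_FREE,		/* mark == 0 */
	EXAMPLE_BUCKET_ALLOCATOR,	/* on a freelist, or an open bucket */
	EXAMPLE_BUCKET_METADATA,	/* btree node, journal, superblock */
	EXAMPLE_BUCKET_DIRTY,		/* data we must not discard */
	EXAMPLE_BUCKET_CACHED,		/* data we may safely discard */
};

static inline enum example_bucket_state
example_bucket_state(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return EXAMPLE_BUCKET_ALLOCATOR;
	if (m.data_type == BCH_DATA_SB ||
	    m.data_type == BCH_DATA_JOURNAL ||
	    m.data_type == BCH_DATA_BTREE)
		return EXAMPLE_BUCKET_METADATA;
	if (m.dirty_sectors)
		return EXAMPLE_BUCKET_DIRTY;
	if (m.cached_sectors)
		return EXAMPLE_BUCKET_CACHED;
	return EXAMPLE_BUCKET_FREE;
}
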
79 /*
80  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
81  * wraparound:
82  */
83 void bch2_bucket_seq_cleanup(struct bch_fs *c)
84 {
85         u64 journal_seq = atomic64_read(&c->journal.seq);
86         u16 last_seq_ondisk = c->journal.last_seq_ondisk;
87         struct bch_dev *ca;
88         struct bucket_array *buckets;
89         struct bucket *g;
90         struct bucket_mark m;
91         unsigned i;
92
93         if (journal_seq - c->last_bucket_seq_cleanup <
94             (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
95                 return;
96
97         c->last_bucket_seq_cleanup = journal_seq;
98
99         for_each_member_device(ca, c, i) {
100                 down_read(&ca->bucket_lock);
101                 buckets = bucket_array(ca);
102
103                 for_each_bucket(g, buckets) {
104                         bucket_cmpxchg(g, m, ({
105                                 if (!m.journal_seq_valid ||
106                                     bucket_needs_journal_commit(m, last_seq_ondisk))
107                                         break;
108
109                                 m.journal_seq_valid = 0;
110                         }));
111                 }
112                 up_read(&ca->bucket_lock);
113         }
114 }
115
116 #define bch2_usage_read_raw(_stats)                                     \
117 ({                                                                      \
118         typeof(*this_cpu_ptr(_stats)) _acc;                             \
119                                                                         \
120         memset(&_acc, 0, sizeof(_acc));                                 \
121         acc_u64s_percpu((u64 *) &_acc,                                  \
122                         (u64 __percpu *) _stats,                        \
123                         sizeof(_acc) / sizeof(u64));                    \
124                                                                         \
125         _acc;                                                           \
126 })
127
128 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
129 {
130         return bch2_usage_read_raw(ca->usage[0]);
131 }
132
133 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
134 {
135         struct bch_fs_usage *ret;
136         unsigned nr = READ_ONCE(c->replicas.nr);
137 retry:
138         ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
139         if (unlikely(!ret))
140                 return NULL;
141
142         percpu_down_read_preempt_disable(&c->mark_lock);
143
144         if (unlikely(nr < c->replicas.nr)) {
145                 nr = c->replicas.nr;
146                 percpu_up_read_preempt_enable(&c->mark_lock);
147                 kfree(ret);
148                 goto retry;
149         }
150
151         acc_u64s_percpu((u64 *) ret,
152                         (u64 __percpu *) c->usage[0],
153                         sizeof(*ret) / sizeof(u64) + nr);
154         percpu_up_read_preempt_enable(&c->mark_lock);
155
156         return ret;
157 }
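
/*
 * Illustrative usage sketch, not part of the upstream file: the struct
 * returned by bch2_fs_usage_read() is heap allocated, so the caller owns it
 * and must kfree() it.  The summary fields printed here are the ones updated
 * elsewhere in this file; the function name and message format are made up.
 */
static inline void example_log_fs_usage(struct bch_fs *c)
{
	struct bch_fs_usage *u = bch2_fs_usage_read(c);

	if (!u)
		return;		/* allocation failed */

	pr_info("data %llu reserved %llu online_reserved %llu\n",
		(unsigned long long) u->s.data,
		(unsigned long long) u->s.reserved,
		(unsigned long long) u->s.online_reserved);

	kfree(u);
}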
158
159 #define RESERVE_FACTOR  6
160
161 static u64 reserve_factor(u64 r)
162 {
163         return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
164 }
165
166 static u64 avail_factor(u64 r)
167 {
168         return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
169 }
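
/*
 * Worked example (illustrative): with RESERVE_FACTOR == 6 these two helpers
 * convert between "sectors of data" and "sectors of raw space including a
 * ~1/64 slack reservation".  reserve_factor(128) = 128 + 128/64 = 130, and
 * avail_factor() is the approximate inverse: avail_factor(130) =
 * (130 * 64) / 65 = 128.
 */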
170
171 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
172 {
173         return min(fs_usage.s.hidden +
174                    fs_usage.s.data +
175                    reserve_factor(fs_usage.s.reserved +
176                                   fs_usage.s.online_reserved),
177                    c->capacity);
178 }
179
180 struct bch_fs_usage_short
181 bch2_fs_usage_read_short(struct bch_fs *c)
182 {
183         struct bch_fs_usage_summarized usage =
184                 bch2_usage_read_raw(&c->usage[0]->s);
185         struct bch_fs_usage_short ret;
186
187         ret.capacity    = READ_ONCE(c->capacity) - usage.hidden;
188         ret.used        = min(ret.capacity, usage.data +
189                               reserve_factor(usage.reserved +
190                                              usage.online_reserved));
191         ret.nr_inodes   = usage.nr_inodes;
192
193         return ret;
194 }
195
196 static inline int is_unavailable_bucket(struct bucket_mark m)
197 {
198         return !is_available_bucket(m);
199 }
200
201 static inline int is_fragmented_bucket(struct bucket_mark m,
202                                        struct bch_dev *ca)
203 {
204         if (!m.owned_by_allocator &&
205             m.data_type == BCH_DATA_USER &&
206             bucket_sectors_used(m))
207                 return max_t(int, 0, (int) ca->mi.bucket_size -
208                              bucket_sectors_used(m));
209         return 0;
210 }
211
212 static inline enum bch_data_type bucket_type(struct bucket_mark m)
213 {
214         return m.cached_sectors && !m.dirty_sectors
215                 ? BCH_DATA_CACHED
216                 : m.data_type;
217 }
218
219 static bool bucket_became_unavailable(struct bucket_mark old,
220                                       struct bucket_mark new)
221 {
222         return is_available_bucket(old) &&
223                !is_available_bucket(new);
224 }
225
226 void bch2_fs_usage_apply(struct bch_fs *c,
227                          struct bch_fs_usage *fs_usage,
228                          struct disk_reservation *disk_res,
229                          struct gc_pos gc_pos)
230 {
231         s64 added = fs_usage->s.data + fs_usage->s.reserved;
232         s64 should_not_have_added;
233
234         percpu_rwsem_assert_held(&c->mark_lock);
235
236         /*
237          * Not allowed to reduce sectors_available except by getting a
238          * reservation:
239          */
240         should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
241         if (WARN_ONCE(should_not_have_added > 0,
242                       "disk usage increased without a reservation")) {
243                 atomic64_sub(should_not_have_added, &c->sectors_available);
244                 added -= should_not_have_added;
245         }
246
247         if (added > 0) {
248                 disk_res->sectors               -= added;
249                 fs_usage->s.online_reserved     -= added;
250         }
251
252         acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
253                  (u64 *) fs_usage,
254                  sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
255
256         if (gc_visited(c, gc_pos)) {
257                 BUG_ON(!c->usage[1]);
258                 acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
259                          (u64 *) fs_usage,
260                          sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
261         }
262 }
263
264 static inline void account_bucket(struct bch_fs_usage *fs_usage,
265                                   struct bch_dev_usage *dev_usage,
266                                   enum bch_data_type type,
267                                   int nr, s64 size)
268 {
269         if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
270                 fs_usage->s.hidden      += size;
271
272         dev_usage->buckets[type]        += nr;
273 }
274
275 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
276                                   struct bch_fs_usage *fs_usage,
277                                   struct bucket_mark old, struct bucket_mark new,
278                                   bool gc)
279 {
280         struct bch_dev_usage *dev_usage;
281
282         percpu_rwsem_assert_held(&c->mark_lock);
283
284         bch2_fs_inconsistent_on(old.data_type && new.data_type &&
285                                 old.data_type != new.data_type, c,
286                 "different types of data in same bucket: %s, %s",
287                 bch2_data_types[old.data_type],
288                 bch2_data_types[new.data_type]);
289
290         dev_usage = this_cpu_ptr(ca->usage[gc]);
291
292         if (bucket_type(old))
293                 account_bucket(fs_usage, dev_usage, bucket_type(old),
294                                -1, -ca->mi.bucket_size);
295
296         if (bucket_type(new))
297                 account_bucket(fs_usage, dev_usage, bucket_type(new),
298                                1, ca->mi.bucket_size);
299
300         dev_usage->buckets_alloc +=
301                 (int) new.owned_by_allocator - (int) old.owned_by_allocator;
302         dev_usage->buckets_ec +=
303                 (int) new.stripe - (int) old.stripe;
304         dev_usage->buckets_unavailable +=
305                 is_unavailable_bucket(new) - is_unavailable_bucket(old);
306
307         dev_usage->sectors[old.data_type] -= old.dirty_sectors;
308         dev_usage->sectors[new.data_type] += new.dirty_sectors;
309         dev_usage->sectors[BCH_DATA_CACHED] +=
310                 (int) new.cached_sectors - (int) old.cached_sectors;
311         dev_usage->sectors_fragmented +=
312                 is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
313
314         if (!is_available_bucket(old) && is_available_bucket(new))
315                 bch2_wake_allocator(ca);
316 }
317
318 void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
319 {
320         struct bucket_mark old = { .v.counter = 0 };
321         struct bch_fs_usage *fs_usage;
322         struct bucket_array *buckets;
323         struct bucket *g;
324
325         percpu_down_read_preempt_disable(&c->mark_lock);
326         fs_usage = this_cpu_ptr(c->usage[0]);
327         buckets = bucket_array(ca);
328
329         for_each_bucket(g, buckets)
330                 if (g->mark.data_type)
331                         bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
332         percpu_up_read_preempt_enable(&c->mark_lock);
333 }
334
335 #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)      \
336 ({                                                              \
337         struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
338                                                                 \
339         bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);  \
340         _old;                                                   \
341 })
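
/*
 * Illustrative sketch, not part of the upstream file: the lockless update
 * pattern behind bucket_cmpxchg()/bucket_data_cmpxchg().  struct bucket_mark
 * packs into a single u64, so an update reads the mark, modifies a local
 * copy and publishes it with a compare-and-swap, retrying on contention -
 * bch2_mark_pointer() below open codes the same loop.  The function name and
 * the particular modification are made up for the example.
 */
static inline struct bucket_mark example_bucket_set_gen(struct bucket *g,
							u8 gen)
{
	struct bucket_mark old, new;
	u64 v = atomic64_read(&g->_mark.v);

	do {
		new.v.counter = old.v.counter = v;
		new.gen = gen;
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
				       old.v.counter,
				       new.v.counter)) != old.v.counter);

	return old;
}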
342
343 static inline void update_replicas(struct bch_fs *c,
344                                    struct bch_fs_usage *fs_usage,
345                                    struct bch_replicas_entry *r,
346                                    s64 sectors)
347 {
348         int idx = bch2_replicas_entry_idx(c, r);
349
350         BUG_ON(idx < 0);
351         BUG_ON(!sectors);
352
353         if (r->data_type == BCH_DATA_CACHED)
354                 fs_usage->s.cached      += sectors;
355         else
356                 fs_usage->s.data        += sectors;
357         fs_usage->data[idx]             += sectors;
358 }
359
360 static inline void update_cached_sectors(struct bch_fs *c,
361                                          struct bch_fs_usage *fs_usage,
362                                          unsigned dev, s64 sectors)
363 {
364         struct bch_replicas_padded r;
365
366         r.e.data_type   = BCH_DATA_CACHED;
367         r.e.nr_devs     = 1;
368         r.e.nr_required = 1;
369         r.e.devs[0]     = dev;
370
371         update_replicas(c, fs_usage, &r.e, sectors);
372 }
373
374 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
375                                      size_t b, struct bucket_mark *old,
376                                      bool gc)
377 {
378         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
379         struct bucket *g = __bucket(ca, b, gc);
380         struct bucket_mark new;
381
382         *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
383                 BUG_ON(!is_available_bucket(new));
384
385                 new.owned_by_allocator  = 1;
386                 new.data_type           = 0;
387                 new.cached_sectors      = 0;
388                 new.dirty_sectors       = 0;
389                 new.gen++;
390         }));
391
392         if (old->cached_sectors)
393                 update_cached_sectors(c, fs_usage, ca->dev_idx,
394                                       -old->cached_sectors);
395 }
396
397 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
398                             size_t b, struct bucket_mark *old)
399 {
400         percpu_rwsem_assert_held(&c->mark_lock);
401
402         __bch2_invalidate_bucket(c, ca, b, old, false);
403
404         if (!old->owned_by_allocator && old->cached_sectors)
405                 trace_invalidate(ca, bucket_to_sector(ca, b),
406                                  old->cached_sectors);
407 }
408
409 static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
410                                      size_t b, bool owned_by_allocator,
411                                      bool gc)
412 {
413         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
414         struct bucket *g = __bucket(ca, b, gc);
415         struct bucket_mark old, new;
416
417         old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
418                 new.owned_by_allocator  = owned_by_allocator;
419         }));
420
421         BUG_ON(!gc &&
422                !owned_by_allocator && !old.owned_by_allocator);
423 }
424
425 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
426                             size_t b, bool owned_by_allocator,
427                             struct gc_pos pos, unsigned flags)
428 {
429         percpu_rwsem_assert_held(&c->mark_lock);
430
431         if (!(flags & BCH_BUCKET_MARK_GC))
432                 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
433
434         if ((flags & BCH_BUCKET_MARK_GC) ||
435             gc_visited(c, pos))
436                 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
437 }
438
439 #define checked_add(a, b)                                       \
440 do {                                                            \
441         unsigned _res = (unsigned) (a) + (b);                   \
442         (a) = _res;                                             \
443         BUG_ON((a) != _res);                                    \
444 } while (0)
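
/*
 * Note on checked_add() (comment added for clarity): the sector counts in
 * struct bucket_mark are bitfields narrower than unsigned, since the whole
 * mark packs into a u64, so the assignment (a) = _res can silently truncate.
 * Recomputing the sum at full width and comparing after the assignment turns
 * that truncation into a BUG_ON().
 */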
445
446 static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
447                                         size_t b, enum bch_data_type type,
448                                         unsigned sectors, bool gc)
449 {
450         struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
451         struct bucket *g = __bucket(ca, b, gc);
452         struct bucket_mark new;
453
454         BUG_ON(type != BCH_DATA_SB &&
455                type != BCH_DATA_JOURNAL);
456
457         bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
458                 new.data_type   = type;
459                 checked_add(new.dirty_sectors, sectors);
460         }));
461 }
462
463 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
464                                size_t b, enum bch_data_type type,
465                                unsigned sectors, struct gc_pos pos,
466                                unsigned flags)
467 {
468         BUG_ON(type != BCH_DATA_SB &&
469                type != BCH_DATA_JOURNAL);
470
471         if (likely(c)) {
472                 percpu_rwsem_assert_held(&c->mark_lock);
473
474                 if (!(flags & BCH_BUCKET_MARK_GC))
475                         __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
476                                                     false);
477                 if ((flags & BCH_BUCKET_MARK_GC) ||
478                     gc_visited(c, pos))
479                         __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
480                                                     true);
481         } else {
482                 struct bucket *g;
483                 struct bucket_mark old, new;
484
485                 rcu_read_lock();
486
487                 g = bucket(ca, b);
488                 old = bucket_cmpxchg(g, new, ({
489                         new.data_type = type;
490                         checked_add(new.dirty_sectors, sectors);
491                 }));
492
493                 rcu_read_unlock();
494         }
495 }
496
497 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
498                                   s64 delta)
499 {
500         if (delta > 0) {
501                 /*
502                  * marking a new extent, which _will have size_ @delta
503                  *
504                  * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
505                  * case, we haven't actually created the key we'll be inserting
506                  * yet (for the split) - so we don't want to be using
507                  * k->size/crc.live_size here:
508                  */
509                 return __ptr_disk_sectors(p, delta);
510         } else {
511                 BUG_ON(-delta > p.crc.live_size);
512
513                 return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
514                         (s64) ptr_disk_sectors(p);
515         }
516 }
517
518 /*
519  * Checking against gc's position has to be done here, inside the cmpxchg()
520  * loop, to avoid racing with the start of gc clearing all the marks - GC does
521  * that with the gc pos seqlock held.
522  */
523 static void bch2_mark_pointer(struct bch_fs *c,
524                               struct extent_ptr_decoded p,
525                               s64 sectors, enum bch_data_type data_type,
526                               struct bch_fs_usage *fs_usage,
527                               unsigned journal_seq, unsigned flags,
528                               bool gc)
529 {
530         struct bucket_mark old, new;
531         struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
532         size_t b = PTR_BUCKET_NR(ca, &p.ptr);
533         struct bucket *g = __bucket(ca, b, gc);
534         u64 v;
535
536         v = atomic64_read(&g->_mark.v);
537         do {
538                 new.v.counter = old.v.counter = v;
539
540                 /*
541                  * Check this after reading bucket mark to guard against
542                  * the allocator invalidating a bucket after we've already
543                  * checked the gen
544                  */
545                 if (gen_after(new.gen, p.ptr.gen)) {
546                         BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
547                         EBUG_ON(!p.ptr.cached &&
548                                 test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
549                         return;
550                 }
551
552                 if (!p.ptr.cached)
553                         checked_add(new.dirty_sectors, sectors);
554                 else
555                         checked_add(new.cached_sectors, sectors);
556
557                 if (!new.dirty_sectors &&
558                     !new.cached_sectors) {
559                         new.data_type   = 0;
560
561                         if (journal_seq) {
562                                 new.journal_seq_valid = 1;
563                                 new.journal_seq = journal_seq;
564                         }
565                 } else {
566                         new.data_type = data_type;
567                 }
568
569                 if (flags & BCH_BUCKET_MARK_NOATOMIC) {
570                         g->_mark = new;
571                         break;
572                 }
573         } while ((v = atomic64_cmpxchg(&g->_mark.v,
574                               old.v.counter,
575                               new.v.counter)) != old.v.counter);
576
577         bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
578
579         BUG_ON(!gc && bucket_became_unavailable(old, new));
580 }
581
582 static int bch2_mark_stripe_ptr(struct bch_fs *c,
583                                 struct bch_extent_stripe_ptr p,
584                                 enum bch_data_type data_type,
585                                 struct bch_fs_usage *fs_usage,
586                                 s64 sectors, unsigned flags,
587                                 bool gc)
588 {
589         struct stripe *m;
590         unsigned old, new, nr_data;
591         int blocks_nonempty_delta;
592         s64 parity_sectors;
593
594         m = genradix_ptr(&c->stripes[gc], p.idx);
595
596         if (!m || !m->alive) {
597                 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
598                                     (u64) p.idx);
599                 return -1;
600         }
601
602         BUG_ON(m->r.e.data_type != data_type);
603
604         nr_data = m->nr_blocks - m->nr_redundant;
605
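	/*
	 * Each stripe has nr_redundant parity blocks covering nr_data data
	 * blocks, so charge parity proportionally, rounded up.  Worked
	 * example (illustrative): marking 8 data sectors in a 4+2 stripe
	 * also accounts DIV_ROUND_UP(8 * 2, 4) = 4 parity sectors.
	 */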
606         parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
607
608         if (sectors < 0)
609                 parity_sectors = -parity_sectors;
610         sectors += parity_sectors;
611
612         new = atomic_add_return(sectors, &m->block_sectors[p.block]);
613         old = new - sectors;
614
615         blocks_nonempty_delta = (int) !!new - (int) !!old;
616         if (!blocks_nonempty_delta)
617                 return 0;
618
619         atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
620
621         BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
622
623         if (!gc)
624                 bch2_stripes_heap_update(c, m, p.idx);
625
626         update_replicas(c, fs_usage, &m->r.e, sectors);
627
628         return 0;
629 }
630
631 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
632                             s64 sectors,
633                             enum bch_data_type data_type,
634                             struct bch_fs_usage *fs_usage,
635                             unsigned journal_seq, unsigned flags,
636                             bool gc)
637 {
638         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
639         const union bch_extent_entry *entry;
640         struct extent_ptr_decoded p;
641         struct bch_replicas_padded r;
642         s64 dirty_sectors = 0;
643         unsigned i;
644         int ret;
645
646         r.e.data_type   = data_type;
647         r.e.nr_devs     = 0;
648         r.e.nr_required = 1;
649
650         BUG_ON(!sectors);
651
652         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
653                 s64 disk_sectors = data_type == BCH_DATA_BTREE
654                         ? sectors
655                         : ptr_disk_sectors_delta(p, sectors);
656
657                 bch2_mark_pointer(c, p, disk_sectors, data_type,
658                                   fs_usage, journal_seq, flags, gc);
659
660                 if (p.ptr.cached) {
661                         update_cached_sectors(c, fs_usage, p.ptr.dev,
662                                               disk_sectors);
663                 } else if (!p.ec_nr) {
664                         dirty_sectors          += disk_sectors;
665                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
666                 } else {
667                         for (i = 0; i < p.ec_nr; i++) {
668                                 ret = bch2_mark_stripe_ptr(c, p.ec[i],
669                                                 data_type, fs_usage,
670                                                 disk_sectors, flags, gc);
671                                 if (ret)
672                                         return ret;
673                         }
674
675                         r.e.nr_required = 0;
676                 }
677         }
678
679         if (dirty_sectors)
680                 update_replicas(c, fs_usage, &r.e, dirty_sectors);
681
682         return 0;
683 }
684
685 static void bucket_set_stripe(struct bch_fs *c,
686                               const struct bch_stripe *v,
687                               bool enabled,
688                               struct bch_fs_usage *fs_usage,
689                               u64 journal_seq,
690                               bool gc)
691 {
692         unsigned i;
693
694         for (i = 0; i < v->nr_blocks; i++) {
695                 const struct bch_extent_ptr *ptr = v->ptrs + i;
696                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
697                 size_t b = PTR_BUCKET_NR(ca, ptr);
698                 struct bucket *g = __bucket(ca, b, gc);
699                 struct bucket_mark new, old;
700
701                 BUG_ON(ptr_stale(ca, ptr));
702
703                 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
704                         new.stripe                      = enabled;
705                         if (journal_seq) {
706                                 new.journal_seq_valid   = 1;
707                                 new.journal_seq         = journal_seq;
708                         }
709                 }));
710
711                 BUG_ON(old.stripe == enabled);
712         }
713 }
714
715 static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
716                             bool inserting,
717                             struct bch_fs_usage *fs_usage,
718                             u64 journal_seq, unsigned flags,
719                             bool gc)
720 {
721         struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
722         size_t idx = s.k->p.offset;
723         struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
724         unsigned i;
725
726         if (!m || (!inserting && !m->alive)) {
727                 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
728                                     idx);
729                 return -1;
730         }
731
732         if (inserting && m->alive) {
733                 bch_err_ratelimited(c, "error marking stripe %zu: already exists",
734                                     idx);
735                 return -1;
736         }
737
738         BUG_ON(atomic_read(&m->blocks_nonempty));
739
740         for (i = 0; i < EC_STRIPE_MAX; i++)
741                 BUG_ON(atomic_read(&m->block_sectors[i]));
742
743         if (inserting) {
744                 m->sectors      = le16_to_cpu(s.v->sectors);
745                 m->algorithm    = s.v->algorithm;
746                 m->nr_blocks    = s.v->nr_blocks;
747                 m->nr_redundant = s.v->nr_redundant;
748
749                 memset(&m->r, 0, sizeof(m->r));
750
751                 m->r.e.data_type        = BCH_DATA_USER;
752                 m->r.e.nr_devs          = s.v->nr_blocks;
753                 m->r.e.nr_required      = s.v->nr_blocks - s.v->nr_redundant;
754
755                 for (i = 0; i < s.v->nr_blocks; i++)
756                         m->r.e.devs[i] = s.v->ptrs[i].dev;
757         }
758
759         /*
760          * XXX: account for stripes somehow here
761          */
762 #if 0
763         update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
764 #endif
765
766         if (!gc) {
767                 if (inserting)
768                         bch2_stripes_heap_insert(c, m, idx);
769                 else
770                         bch2_stripes_heap_del(c, m, idx);
771         } else {
772                 m->alive = inserting;
773         }
774
775         bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
776         return 0;
777 }
778
779 static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
780                            bool inserting, s64 sectors,
781                            struct bch_fs_usage *fs_usage,
782                            unsigned journal_seq, unsigned flags,
783                            bool gc)
784 {
785         int ret = 0;
786
787         switch (k.k->type) {
788         case KEY_TYPE_btree_ptr:
789                 ret = bch2_mark_extent(c, k, inserting
790                                        ?  c->opts.btree_node_size
791                                        : -c->opts.btree_node_size,
792                                        BCH_DATA_BTREE,
793                                        fs_usage, journal_seq, flags, gc);
794                 break;
795         case KEY_TYPE_extent:
796                 ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
797                                        fs_usage, journal_seq, flags, gc);
798                 break;
799         case KEY_TYPE_stripe:
800                 ret = bch2_mark_stripe(c, k, inserting,
801                                        fs_usage, journal_seq, flags, gc);
802                 break;
803         case KEY_TYPE_alloc:
804                 if (inserting)
805                         fs_usage->s.nr_inodes++;
806                 else
807                         fs_usage->s.nr_inodes--;
808                 break;
809         case KEY_TYPE_reservation: {
810                 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
811
812                 sectors *= replicas;
813                 replicas = clamp_t(unsigned, replicas, 1,
814                                    ARRAY_SIZE(fs_usage->persistent_reserved));
815
816                 fs_usage->s.reserved                            += sectors;
817                 fs_usage->persistent_reserved[replicas - 1]     += sectors;
818                 break;
819         }
820         default:
821                 break;
822         }
823
824         return ret;
825 }
826
827 int bch2_mark_key_locked(struct bch_fs *c,
828                    struct bkey_s_c k,
829                    bool inserting, s64 sectors,
830                    struct gc_pos pos,
831                    struct bch_fs_usage *fs_usage,
832                    u64 journal_seq, unsigned flags)
833 {
834         int ret;
835
836         if (!(flags & BCH_BUCKET_MARK_GC)) {
837                 ret = __bch2_mark_key(c, k, inserting, sectors,
838                                       fs_usage ?: this_cpu_ptr(c->usage[0]),
839                                       journal_seq, flags, false);
840                 if (ret)
841                         return ret;
842         }
843
844         if ((flags & BCH_BUCKET_MARK_GC) ||
845             gc_visited(c, pos)) {
846                 ret = __bch2_mark_key(c, k, inserting, sectors,
847                                       this_cpu_ptr(c->usage[1]),
848                                       journal_seq, flags, true);
849                 if (ret)
850                         return ret;
851         }
852
853         return 0;
854 }
855
856 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
857                   bool inserting, s64 sectors,
858                   struct gc_pos pos,
859                   struct bch_fs_usage *fs_usage,
860                   u64 journal_seq, unsigned flags)
861 {
862         int ret;
863
864         percpu_down_read_preempt_disable(&c->mark_lock);
865         ret = bch2_mark_key_locked(c, k, inserting, sectors,
866                                    pos, fs_usage, journal_seq, flags);
867         percpu_up_read_preempt_enable(&c->mark_lock);
868
869         return ret;
870 }
871
872 void bch2_mark_update(struct btree_insert *trans,
873                       struct btree_insert_entry *insert)
874 {
875         struct bch_fs           *c = trans->c;
876         struct btree_iter       *iter = insert->iter;
877         struct btree            *b = iter->l[0].b;
878         struct btree_node_iter  node_iter = iter->l[0].iter;
879         struct bch_fs_usage     *fs_usage;
880         struct gc_pos           pos = gc_pos_btree_node(b);
881         struct bkey_packed      *_k;
882
883         if (!btree_node_type_needs_gc(iter->btree_id))
884                 return;
885
886         percpu_down_read_preempt_disable(&c->mark_lock);
887         fs_usage = bch2_fs_usage_get_scratch(c);
888
889         if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
890                 bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
891                         bpos_min(insert->k->k.p, b->key.k.p).offset -
892                         bkey_start_offset(&insert->k->k),
893                         pos, fs_usage, trans->journal_res.seq, 0);
894
895         while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
896                                                       KEY_TYPE_discard))) {
897                 struct bkey             unpacked;
898                 struct bkey_s_c         k;
899                 s64                     sectors = 0;
900
901                 k = bkey_disassemble(b, _k, &unpacked);
902
903                 if (btree_node_is_extents(b)
904                     ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
905                     : bkey_cmp(insert->k->k.p, k.k->p))
906                         break;
907
908                 if (btree_node_is_extents(b)) {
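			/*
			 * @sectors is the change to the accounted size of
			 * the existing key @k caused by @insert; by the end
			 * of the switch it is always negative (see the
			 * BUG_ON below).  The MIDDLE case also accounts the
			 * surviving back fragment with a separate positive
			 * mark.  Worked example (illustrative): if @k covers
			 * [0, 100) and @insert covers [20, 60), MIDDLE marks
			 * +40 for the fragment [60, 100), then -80 for @k,
			 * a net change of -40.
			 */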
909                         switch (bch2_extent_overlap(&insert->k->k, k.k)) {
910                         case BCH_EXTENT_OVERLAP_ALL:
911                                 sectors = -((s64) k.k->size);
912                                 break;
913                         case BCH_EXTENT_OVERLAP_BACK:
914                                 sectors = bkey_start_offset(&insert->k->k) -
915                                         k.k->p.offset;
916                                 break;
917                         case BCH_EXTENT_OVERLAP_FRONT:
918                                 sectors = bkey_start_offset(k.k) -
919                                         insert->k->k.p.offset;
920                                 break;
921                         case BCH_EXTENT_OVERLAP_MIDDLE:
922                                 sectors = k.k->p.offset - insert->k->k.p.offset;
923                                 BUG_ON(sectors <= 0);
924
925                                 bch2_mark_key_locked(c, k, true, sectors,
926                                         pos, fs_usage, trans->journal_res.seq, 0);
927
928                                 sectors = bkey_start_offset(&insert->k->k) -
929                                         k.k->p.offset;
930                                 break;
931                         }
932
933                         BUG_ON(sectors >= 0);
934                 }
935
936                 bch2_mark_key_locked(c, k, false, sectors,
937                         pos, fs_usage, trans->journal_res.seq, 0);
938
939                 bch2_btree_node_iter_advance(&node_iter, b);
940         }
941
942         bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
943
944         percpu_up_read_preempt_enable(&c->mark_lock);
945 }
946
947 /* Disk reservations: */
948
949 static u64 bch2_recalc_sectors_available(struct bch_fs *c)
950 {
951         int cpu;
952
953         for_each_possible_cpu(cpu)
954                 per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0;
955
956         return avail_factor(bch2_fs_sectors_free(c));
957 }
958
959 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
960 {
961         percpu_down_read_preempt_disable(&c->mark_lock);
962         this_cpu_sub(c->usage[0]->s.online_reserved,
963                      res->sectors);
964         percpu_up_read_preempt_enable(&c->mark_lock);
965
966         res->sectors = 0;
967 }
968
969 #define SECTORS_CACHE   1024
970
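/*
 * Reservation fast path, in outline: each CPU keeps a small local pool
 * (pcpu->sectors_available), refilled from the shared atomic
 * c->sectors_available by grabbing up to SECTORS_CACHE sectors beyond what
 * was asked for, so most reservations never touch the shared counter.  Only
 * when the shared pool looks exhausted do we fall back to recalculating it
 * under mark_lock (and possibly gc_lock) below.
 */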
971 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
972                               unsigned sectors, int flags)
973 {
974         struct bch_fs_pcpu *pcpu;
975         u64 old, v, get;
976         s64 sectors_available;
977         int ret;
978
979         percpu_down_read_preempt_disable(&c->mark_lock);
980         pcpu = this_cpu_ptr(c->pcpu);
981
982         if (sectors <= pcpu->sectors_available)
983                 goto out;
984
985         v = atomic64_read(&c->sectors_available);
986         do {
987                 old = v;
988                 get = min((u64) sectors + SECTORS_CACHE, old);
989
990                 if (get < sectors) {
991                         percpu_up_read_preempt_enable(&c->mark_lock);
992                         goto recalculate;
993                 }
994         } while ((v = atomic64_cmpxchg(&c->sectors_available,
995                                        old, old - get)) != old);
996
997         pcpu->sectors_available         += get;
998
999 out:
1000         pcpu->sectors_available         -= sectors;
1001         this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1002         res->sectors                    += sectors;
1003
1004         percpu_up_read_preempt_enable(&c->mark_lock);
1005         return 0;
1006
1007 recalculate:
1008         /*
1009          * GC recalculates sectors_available when it starts, so that hopefully
1010          * we don't normally end up blocking here:
1011          */
1012
1013         /*
1014          * Unfortunately, we can be called from extent_insert_fixup() with btree
1015          * locks held:
1016          */
1017
1018         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
1019                 if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
1020                         down_read(&c->gc_lock);
1021                 else if (!down_read_trylock(&c->gc_lock))
1022                         return -EINTR;
1023         }
1024
1025         percpu_down_write(&c->mark_lock);
1026         sectors_available = bch2_recalc_sectors_available(c);
1027
1028         if (sectors <= sectors_available ||
1029             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1030                 atomic64_set(&c->sectors_available,
1031                              max_t(s64, 0, sectors_available - sectors));
1032                 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1033                 res->sectors                    += sectors;
1034                 ret = 0;
1035         } else {
1036                 atomic64_set(&c->sectors_available, sectors_available);
1037                 ret = -ENOSPC;
1038         }
1039
1040         percpu_up_write(&c->mark_lock);
1041
1042         if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
1043                 up_read(&c->gc_lock);
1044
1045         return ret;
1046 }
1047
1048 /* Startup/shutdown: */
1049
1050 static void buckets_free_rcu(struct rcu_head *rcu)
1051 {
1052         struct bucket_array *buckets =
1053                 container_of(rcu, struct bucket_array, rcu);
1054
1055         kvpfree(buckets,
1056                 sizeof(struct bucket_array) +
1057                 buckets->nbuckets * sizeof(struct bucket));
1058 }
1059
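/*
 * Resizing, in outline: allocate the new arrays up front, stop copygc, then
 * (for an actual resize) take gc_lock, the device's bucket_lock and
 * mark_lock for write while the old contents are copied over and the new
 * bucket_array is published with rcu_assign_pointer().  The old array is
 * freed via call_rcu() once readers are done with it; everything else is
 * handed over with swap()/fifo_move(), and the displaced versions are freed
 * through the common err: path at the bottom.
 */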
1060 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1061 {
1062         struct bucket_array *buckets = NULL, *old_buckets = NULL;
1063         unsigned long *buckets_nouse = NULL;
1064         unsigned long *buckets_written = NULL;
1065         u8 *oldest_gens = NULL;
1066         alloc_fifo      free[RESERVE_NR];
1067         alloc_fifo      free_inc;
1068         alloc_heap      alloc_heap;
1069         copygc_heap     copygc_heap;
1070
1071         size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
1072                              ca->mi.bucket_size / c->opts.btree_node_size);
1073         /* XXX: these should be tunable */
1074         size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
1075         size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
1076         size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
1077                                       btree_reserve * 2);
1078         bool resize = ca->buckets[0] != NULL,
1079              start_copygc = ca->copygc_thread != NULL;
1080         int ret = -ENOMEM;
1081         unsigned i;
1082
1083         memset(&free,           0, sizeof(free));
1084         memset(&free_inc,       0, sizeof(free_inc));
1085         memset(&alloc_heap,     0, sizeof(alloc_heap));
1086         memset(&copygc_heap,    0, sizeof(copygc_heap));
1087
1088         if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
1089                                             nbuckets * sizeof(struct bucket),
1090                                             GFP_KERNEL|__GFP_ZERO)) ||
1091             !(oldest_gens       = kvpmalloc(nbuckets * sizeof(u8),
1092                                             GFP_KERNEL|__GFP_ZERO)) ||
1093             !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1094                                             sizeof(unsigned long),
1095                                             GFP_KERNEL|__GFP_ZERO)) ||
1096             !(buckets_written   = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1097                                             sizeof(unsigned long),
1098                                             GFP_KERNEL|__GFP_ZERO)) ||
1099             !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
1100             !init_fifo(&free[RESERVE_MOVINGGC],
1101                        copygc_reserve, GFP_KERNEL) ||
1102             !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1103             !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
1104             !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
1105             !init_heap(&copygc_heap,    copygc_reserve, GFP_KERNEL))
1106                 goto err;
1107
1108         buckets->first_bucket   = ca->mi.first_bucket;
1109         buckets->nbuckets       = nbuckets;
1110
1111         bch2_copygc_stop(ca);
1112
1113         if (resize) {
1114                 down_write(&c->gc_lock);
1115                 down_write(&ca->bucket_lock);
1116                 percpu_down_write(&c->mark_lock);
1117         }
1118
1119         old_buckets = bucket_array(ca);
1120
1121         if (resize) {
1122                 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
1123
1124                 memcpy(buckets->b,
1125                        old_buckets->b,
1126                        n * sizeof(struct bucket));
1127                 memcpy(oldest_gens,
1128                        ca->oldest_gens,
1129                        n * sizeof(u8));
1130                 memcpy(buckets_nouse,
1131                        ca->buckets_nouse,
1132                        BITS_TO_LONGS(n) * sizeof(unsigned long));
1133                 memcpy(buckets_written,
1134                        ca->buckets_written,
1135                        BITS_TO_LONGS(n) * sizeof(unsigned long));
1136         }
1137
1138         rcu_assign_pointer(ca->buckets[0], buckets);
1139         buckets = old_buckets;
1140
1141         swap(ca->oldest_gens, oldest_gens);
1142         swap(ca->buckets_nouse, buckets_nouse);
1143         swap(ca->buckets_written, buckets_written);
1144
1145         if (resize)
1146                 percpu_up_write(&c->mark_lock);
1147
1148         spin_lock(&c->freelist_lock);
1149         for (i = 0; i < RESERVE_NR; i++) {
1150                 fifo_move(&free[i], &ca->free[i]);
1151                 swap(ca->free[i], free[i]);
1152         }
1153         fifo_move(&free_inc, &ca->free_inc);
1154         swap(ca->free_inc, free_inc);
1155         spin_unlock(&c->freelist_lock);
1156
1157         /* with gc lock held, alloc_heap can't be in use: */
1158         swap(ca->alloc_heap, alloc_heap);
1159
1160         /* and we shut down copygc: */
1161         swap(ca->copygc_heap, copygc_heap);
1162
1163         nbuckets = ca->mi.nbuckets;
1164
1165         if (resize) {
1166                 up_write(&ca->bucket_lock);
1167                 up_write(&c->gc_lock);
1168         }
1169
1170         if (start_copygc &&
1171             bch2_copygc_start(c, ca))
1172                 bch_err(ca, "error restarting copygc thread");
1173
1174         ret = 0;
1175 err:
1176         free_heap(&copygc_heap);
1177         free_heap(&alloc_heap);
1178         free_fifo(&free_inc);
1179         for (i = 0; i < RESERVE_NR; i++)
1180                 free_fifo(&free[i]);
1181         kvpfree(buckets_nouse,
1182                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1183         kvpfree(buckets_written,
1184                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1185         kvpfree(oldest_gens,
1186                 nbuckets * sizeof(u8));
1187         if (buckets)
1188                 call_rcu(&old_buckets->rcu, buckets_free_rcu);
1189
1190         return ret;
1191 }
1192
1193 void bch2_dev_buckets_free(struct bch_dev *ca)
1194 {
1195         unsigned i;
1196
1197         free_heap(&ca->copygc_heap);
1198         free_heap(&ca->alloc_heap);
1199         free_fifo(&ca->free_inc);
1200         for (i = 0; i < RESERVE_NR; i++)
1201                 free_fifo(&ca->free[i]);
1202         kvpfree(ca->buckets_written,
1203                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1204         kvpfree(ca->buckets_nouse,
1205                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1206         kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
1207         kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
1208                 sizeof(struct bucket_array) +
1209                 ca->mi.nbuckets * sizeof(struct bucket));
1210
1211         free_percpu(ca->usage[0]);
1212 }
1213
1214 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1215 {
1216         if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
1217                 return -ENOMEM;
1218
1219         return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
1220 }