1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Code for manipulating bucket marks for garbage collection.
4  *
5  * Copyright 2014 Datera, Inc.
6  *
7  * Bucket states:
8  * - free bucket: mark == 0
9  *   The bucket contains no data and will not be read
10  *
11  * - allocator bucket: owned_by_allocator == 1
12  *   The bucket is on a free list, or it is an open bucket
13  *
14  * - cached bucket: owned_by_allocator == 0 &&
15  *                  dirty_sectors == 0 &&
16  *                  cached_sectors > 0
17  *   The bucket contains data but may be safely discarded as there are
18  *   enough replicas of the data on other cache devices, or it has been
19  *   written back to the backing device
20  *
21  * - dirty bucket: owned_by_allocator == 0 &&
22  *                 dirty_sectors > 0
23  *   The bucket contains data that we must not discard (either the only copy,
24  *   or one of the 'main copies' for data requiring multiple replicas)
25  *
26  * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
27  *   This is a btree node, journal or gen/prio bucket
28  *
29  * Lifecycle:
30  *
31  * bucket invalidated => bucket on freelist => open bucket =>
32  *     [dirty bucket =>] cached bucket => bucket invalidated => ...
33  *
34  * Note that cache promotion can skip the dirty bucket step, as data
35  * is copied from a deeper tier to a shallower tier, onto a cached
36  * bucket.
37  * Note also that a cached bucket can spontaneously become dirty --
38  * see below.
39  *
40  * Only a traversal of the key space can determine whether a bucket is
41  * truly dirty or cached.
42  *
43  * Transitions:
44  *
45  * - free => allocator: bucket was invalidated
46  * - cached => allocator: bucket was invalidated
47  *
48  * - allocator => dirty: open bucket was filled up
49  * - allocator => cached: open bucket was filled up
50  * - allocator => metadata: metadata was allocated
51  *
52  * - dirty => cached: dirty sectors were copied to a deeper tier
53  * - dirty => free: dirty sectors were overwritten or moved (copy gc)
54  * - cached => free: cached sectors were overwritten
55  *
56  * - metadata => free: metadata was freed
57  *
58  * Oddities:
59  * - cached => dirty: a device was removed so formerly replicated data
60  *                    is no longer sufficiently replicated
61  * - free => cached: cannot happen
62  * - free => dirty: cannot happen
63  * - free => metadata: cannot happen
64  */
65
66 #include "bcachefs.h"
67 #include "alloc_background.h"
68 #include "bset.h"
69 #include "btree_gc.h"
70 #include "btree_update.h"
71 #include "buckets.h"
72 #include "ec.h"
73 #include "error.h"
74 #include "movinggc.h"
75 #include "replicas.h"
76
77 #include <linux/preempt.h>
78 #include <trace/events/bcachefs.h>
79
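/*
 * Illustrative sketch (editorial, not upstream code): one way to classify a
 * bucket into the states described in the header comment, using bucket_mark
 * fields this file already relies on.  The enum and helper below are
 * hypothetical and unused elsewhere in the file.
 */
enum bucket_state_example {
	BUCKET_EX_free,
	BUCKET_EX_owned_by_allocator,
	BUCKET_EX_cached,
	BUCKET_EX_dirty,
	BUCKET_EX_metadata,
};

static inline enum bucket_state_example
bucket_state_example(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return BUCKET_EX_owned_by_allocator;

	/* btree node, journal or superblock bucket: */
	if (m.data_type == BCH_DATA_sb ||
	    m.data_type == BCH_DATA_journal ||
	    m.data_type == BCH_DATA_btree)
		return BUCKET_EX_metadata;

	if (m.dirty_sectors)
		return BUCKET_EX_dirty;
	if (m.cached_sectors)
		return BUCKET_EX_cached;
	return BUCKET_EX_free;
}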
80 static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
81                                               enum bch_data_type data_type,
82                                               s64 sectors)
83 {
84         switch (data_type) {
85         case BCH_DATA_btree:
86                 fs_usage->btree         += sectors;
87                 break;
88         case BCH_DATA_user:
89         case BCH_DATA_parity:
90                 fs_usage->data          += sectors;
91                 break;
92         case BCH_DATA_cached:
93                 fs_usage->cached        += sectors;
94                 break;
95         default:
96                 break;
97         }
98 }
99
100 /*
101  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
102  * wraparound:
103  */
104 void bch2_bucket_seq_cleanup(struct bch_fs *c)
105 {
106         u64 journal_seq = atomic64_read(&c->journal.seq);
107         u16 last_seq_ondisk = c->journal.last_seq_ondisk;
108         struct bch_dev *ca;
109         struct bucket_array *buckets;
110         struct bucket *g;
111         struct bucket_mark m;
112         unsigned i;
113
114         if (journal_seq - c->last_bucket_seq_cleanup <
115             (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
116                 return;
117
118         c->last_bucket_seq_cleanup = journal_seq;
119
120         for_each_member_device(ca, c, i) {
121                 down_read(&ca->bucket_lock);
122                 buckets = bucket_array(ca);
123
124                 for_each_bucket(g, buckets) {
125                         bucket_cmpxchg(g, m, ({
126                                 if (!m.journal_seq_valid ||
127                                     bucket_needs_journal_commit(m, last_seq_ondisk))
128                                         break;
129
130                                 m.journal_seq_valid = 0;
131                         }));
132                 }
133                 up_read(&ca->bucket_lock);
134         }
135 }
136
137 void bch2_fs_usage_initialize(struct bch_fs *c)
138 {
139         struct bch_fs_usage *usage;
140         struct bch_dev *ca;
141         unsigned i;
142
143         percpu_down_write(&c->mark_lock);
144         usage = c->usage_base;
145
146         for (i = 0; i < ARRAY_SIZE(c->usage); i++)
147                 bch2_fs_usage_acc_to_base(c, i);
148
149         for (i = 0; i < BCH_REPLICAS_MAX; i++)
150                 usage->reserved += usage->persistent_reserved[i];
151
152         for (i = 0; i < c->replicas.nr; i++) {
153                 struct bch_replicas_entry *e =
154                         cpu_replicas_entry(&c->replicas, i);
155
156                 fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
157         }
158
159         for_each_member_device(ca, c, i) {
160                 struct bch_dev_usage dev = bch2_dev_usage_read(ca);
161
162                 usage->hidden += (dev.d[BCH_DATA_sb].buckets +
163                                   dev.d[BCH_DATA_journal].buckets) *
164                         ca->mi.bucket_size;
165         }
166
167         percpu_up_write(&c->mark_lock);
168 }
169
170 static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
171                                                   unsigned journal_seq,
172                                                   bool gc)
173 {
174         return this_cpu_ptr(gc
175                             ? ca->usage_gc
176                             : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
177 }
178
179 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
180 {
181         struct bch_fs *c = ca->fs;
182         struct bch_dev_usage ret;
183         unsigned seq, i, u64s = dev_usage_u64s();
184
185         do {
186                 seq = read_seqcount_begin(&c->usage_lock);
187                 memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
188                 for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
189                         acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
190         } while (read_seqcount_retry(&c->usage_lock, seq));
191
192         return ret;
193 }
194
195 static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
196                                                 unsigned journal_seq,
197                                                 bool gc)
198 {
199         return this_cpu_ptr(gc
200                             ? c->usage_gc
201                             : c->usage[journal_seq & JOURNAL_BUF_MASK]);
202 }
203
204 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
205 {
206         ssize_t offset = v - (u64 *) c->usage_base;
207         unsigned i, seq;
208         u64 ret;
209
210         BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
211         percpu_rwsem_assert_held(&c->mark_lock);
212
213         do {
214                 seq = read_seqcount_begin(&c->usage_lock);
215                 ret = *v;
216
217                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
218                         ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
219         } while (read_seqcount_retry(&c->usage_lock, seq));
220
221         return ret;
222 }
223
224 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
225 {
226         struct bch_fs_usage_online *ret;
227         unsigned seq, i, u64s;
228
229         percpu_down_read(&c->mark_lock);
230
231         ret = kmalloc(sizeof(struct bch_fs_usage_online) +
232                       sizeof(u64) * c->replicas.nr, GFP_NOFS);
233         if (unlikely(!ret)) {
234                 percpu_up_read(&c->mark_lock);
235                 return NULL;
236         }
237
238         ret->online_reserved = percpu_u64_get(c->online_reserved);
239
240         u64s = fs_usage_u64s(c);
241         do {
242                 seq = read_seqcount_begin(&c->usage_lock);
243                 memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
244                 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
245                         acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
246         } while (read_seqcount_retry(&c->usage_lock, seq));
247
248         return ret;
249 }
250
251 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
252 {
253         struct bch_dev *ca;
254         unsigned i, u64s = fs_usage_u64s(c);
255
256         BUG_ON(idx >= ARRAY_SIZE(c->usage));
257
258         preempt_disable();
259         write_seqcount_begin(&c->usage_lock);
260
261         acc_u64s_percpu((u64 *) c->usage_base,
262                         (u64 __percpu *) c->usage[idx], u64s);
263         percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
264
265         rcu_read_lock();
266         for_each_member_device_rcu(ca, c, i, NULL) {
267                 u64s = dev_usage_u64s();
268
269                 acc_u64s_percpu((u64 *) ca->usage_base,
270                                 (u64 __percpu *) ca->usage[idx], u64s);
271                 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
272         }
273         rcu_read_unlock();
274
275         write_seqcount_end(&c->usage_lock);
276         preempt_enable();
277 }
278
279 void bch2_fs_usage_to_text(struct printbuf *out,
280                            struct bch_fs *c,
281                            struct bch_fs_usage_online *fs_usage)
282 {
283         unsigned i;
284
285         pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
286
287         pr_buf(out, "hidden:\t\t\t\t%llu\n",
288                fs_usage->u.hidden);
289         pr_buf(out, "data:\t\t\t\t%llu\n",
290                fs_usage->u.data);
291         pr_buf(out, "cached:\t\t\t\t%llu\n",
292                fs_usage->u.cached);
293         pr_buf(out, "reserved:\t\t\t%llu\n",
294                fs_usage->u.reserved);
295         pr_buf(out, "nr_inodes:\t\t\t%llu\n",
296                fs_usage->u.nr_inodes);
297         pr_buf(out, "online reserved:\t\t%llu\n",
298                fs_usage->online_reserved);
299
300         for (i = 0;
301              i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
302              i++) {
303                 pr_buf(out, "%u replicas:\n", i + 1);
304                 pr_buf(out, "\treserved:\t\t%llu\n",
305                        fs_usage->u.persistent_reserved[i]);
306         }
307
308         for (i = 0; i < c->replicas.nr; i++) {
309                 struct bch_replicas_entry *e =
310                         cpu_replicas_entry(&c->replicas, i);
311
312                 pr_buf(out, "\t");
313                 bch2_replicas_entry_to_text(out, e);
314                 pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
315         }
316 }
317
318 #define RESERVE_FACTOR  6
319
320 static u64 reserve_factor(u64 r)
321 {
322         return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
323 }
324
325 static u64 avail_factor(u64 r)
326 {
327         return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
328 }
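/*
 * Worked example (illustrative): with RESERVE_FACTOR == 6, reserve_factor()
 * pads a reservation by roughly 1/64th, rounded up, and avail_factor()
 * approximately inverts it:
 *
 *   reserve_factor(1000) = 1000 + round_up(1000, 64) / 64
 *                        = 1000 + 1024 / 64 = 1016
 *   avail_factor(1016)   = (1016 * 64) / 65 = 65024 / 65 = 1000
 *
 * so sectors reported as available shrink by ~1.5% relative to the raw
 * figure, leaving slack for the padded reservations above.
 */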
329
330 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
331 {
332         return min(fs_usage->u.hidden +
333                    fs_usage->u.btree +
334                    fs_usage->u.data +
335                    reserve_factor(fs_usage->u.reserved +
336                                   fs_usage->online_reserved),
337                    c->capacity);
338 }
339
340 static struct bch_fs_usage_short
341 __bch2_fs_usage_read_short(struct bch_fs *c)
342 {
343         struct bch_fs_usage_short ret;
344         u64 data, reserved;
345
346         ret.capacity = c->capacity -
347                 bch2_fs_usage_read_one(c, &c->usage_base->hidden);
348
349         data            = bch2_fs_usage_read_one(c, &c->usage_base->data) +
350                 bch2_fs_usage_read_one(c, &c->usage_base->btree);
351         reserved        = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
352                 percpu_u64_get(c->online_reserved);
353
354         ret.used        = min(ret.capacity, data + reserve_factor(reserved));
355         ret.free        = ret.capacity - ret.used;
356
357         ret.nr_inodes   = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
358
359         return ret;
360 }
361
362 struct bch_fs_usage_short
363 bch2_fs_usage_read_short(struct bch_fs *c)
364 {
365         struct bch_fs_usage_short ret;
366
367         percpu_down_read(&c->mark_lock);
368         ret = __bch2_fs_usage_read_short(c);
369         percpu_up_read(&c->mark_lock);
370
371         return ret;
372 }
373
374 static inline int is_unavailable_bucket(struct bucket_mark m)
375 {
376         return !is_available_bucket(m);
377 }
378
379 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
380                                             struct bucket_mark m)
381 {
382         return bucket_sectors_used(m)
383                 ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
384                 : 0;
385 }
386
387 static inline int is_stripe_data_bucket(struct bucket_mark m)
388 {
389         return m.stripe && m.data_type != BCH_DATA_parity;
390 }
391
392 static inline enum bch_data_type bucket_type(struct bucket_mark m)
393 {
394         return m.cached_sectors && !m.dirty_sectors
395                 ? BCH_DATA_cached
396                 : m.data_type;
397 }
398
399 static bool bucket_became_unavailable(struct bucket_mark old,
400                                       struct bucket_mark new)
401 {
402         return is_available_bucket(old) &&
403                !is_available_bucket(new);
404 }
405
406 static inline void account_bucket(struct bch_fs_usage *fs_usage,
407                                   struct bch_dev_usage *dev_usage,
408                                   enum bch_data_type type,
409                                   int nr, s64 size)
410 {
411         if (type == BCH_DATA_sb || type == BCH_DATA_journal)
412                 fs_usage->hidden        += size;
413
414         dev_usage->d[type].buckets      += nr;
415 }
416
417 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
418                                   struct bch_fs_usage *fs_usage,
419                                   struct bucket_mark old, struct bucket_mark new,
420                                   u64 journal_seq, bool gc)
421 {
422         struct bch_dev_usage *u;
423
424         percpu_rwsem_assert_held(&c->mark_lock);
425
426         preempt_disable();
427         if (!fs_usage)
428                 fs_usage = fs_usage_ptr(c, journal_seq, gc);
429         u = dev_usage_ptr(ca, journal_seq, gc);
430
431         if (bucket_type(old))
432                 account_bucket(fs_usage, u, bucket_type(old),
433                                -1, -ca->mi.bucket_size);
434
435         if (bucket_type(new))
436                 account_bucket(fs_usage, u, bucket_type(new),
437                                1, ca->mi.bucket_size);
438
439         u->buckets_ec += (int) new.stripe - (int) old.stripe;
440         u->buckets_unavailable +=
441                 is_unavailable_bucket(new) - is_unavailable_bucket(old);
442
443         u->d[old.data_type].sectors -= old.dirty_sectors;
444         u->d[new.data_type].sectors += new.dirty_sectors;
445         u->d[BCH_DATA_cached].sectors +=
446                 (int) new.cached_sectors - (int) old.cached_sectors;
447
448         u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
449         u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
450
451         preempt_enable();
452
453         if (!is_available_bucket(old) && is_available_bucket(new))
454                 bch2_wake_allocator(ca);
455 }
456
457 static inline void update_replicas(struct bch_fs *c,
458                                    struct bch_fs_usage *fs_usage,
459                                    struct bch_replicas_entry *r,
460                                    s64 sectors)
461 {
462         int idx = bch2_replicas_entry_idx(c, r);
463
464         BUG_ON(idx < 0);
465
466         fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
467         fs_usage->replicas[idx]         += sectors;
468 }
469
470 static inline void update_cached_sectors(struct bch_fs *c,
471                                          struct bch_fs_usage *fs_usage,
472                                          unsigned dev, s64 sectors)
473 {
474         struct bch_replicas_padded r;
475
476         bch2_replicas_entry_cached(&r.e, dev);
477
478         update_replicas(c, fs_usage, &r.e, sectors);
479 }
480
481 static struct replicas_delta_list *
482 replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
483 {
484         struct replicas_delta_list *d = trans->fs_usage_deltas;
485         unsigned new_size = d ? (d->size + more) * 2 : 128;
486
487         if (!d || d->used + more > d->size) {
488                 d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO);
489                 BUG_ON(!d);
490
491                 d->size = new_size;
492                 trans->fs_usage_deltas = d;
493         }
494         return d;
495 }
496
497 static inline void update_replicas_list(struct btree_trans *trans,
498                                         struct bch_replicas_entry *r,
499                                         s64 sectors)
500 {
501         struct replicas_delta_list *d;
502         struct replicas_delta *n;
503         unsigned b;
504
505         if (!sectors)
506                 return;
507
508         b = replicas_entry_bytes(r) + 8;
509         d = replicas_deltas_realloc(trans, b);
510
511         n = (void *) d->d + d->used;
512         n->delta = sectors;
513         memcpy(&n->r, r, replicas_entry_bytes(r));
514         bch2_replicas_entry_sort(&n->r);
515         d->used += b;
516 }
517
518 static inline void update_cached_sectors_list(struct btree_trans *trans,
519                                               unsigned dev, s64 sectors)
520 {
521         struct bch_replicas_padded r;
522
523         bch2_replicas_entry_cached(&r.e, dev);
524
525         update_replicas_list(trans, &r.e, sectors);
526 }
527
528 #define do_mark_fn(fn, c, pos, flags, ...)                              \
529 ({                                                                      \
530         int gc, ret = 0;                                                \
531                                                                         \
532         percpu_rwsem_assert_held(&c->mark_lock);                        \
533                                                                         \
534         for (gc = 0; gc < 2 && !ret; gc++)                              \
535                 if (!gc == !(flags & BTREE_TRIGGER_GC) ||               \
536                     (gc && gc_visited(c, pos)))                         \
537                         ret = fn(c, __VA_ARGS__, gc);                   \
538         ret;                                                            \
539 })
540
541 static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
542                                     size_t b, bool owned_by_allocator,
543                                     bool gc)
544 {
545         struct bucket *g = __bucket(ca, b, gc);
546         struct bucket_mark old, new;
547
548         old = bucket_cmpxchg(g, new, ({
549                 new.owned_by_allocator  = owned_by_allocator;
550         }));
551
552         BUG_ON(!gc &&
553                !owned_by_allocator && !old.owned_by_allocator);
554
555         return 0;
556 }
557
558 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
559                             size_t b, bool owned_by_allocator,
560                             struct gc_pos pos, unsigned flags)
561 {
562         preempt_disable();
563
564         do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
565                    ca, b, owned_by_allocator);
566
567         preempt_enable();
568 }
569
570 static int bch2_mark_alloc(struct bch_fs *c,
571                            struct bkey_s_c old, struct bkey_s_c new,
572                            struct bch_fs_usage *fs_usage,
573                            u64 journal_seq, unsigned flags)
574 {
575         bool gc = flags & BTREE_TRIGGER_GC;
576         struct bkey_alloc_unpacked u;
577         struct bch_dev *ca;
578         struct bucket *g;
579         struct bucket_mark old_m, m;
580
581         /* We don't do anything for deletions - do we?: */
582         if (new.k->type != KEY_TYPE_alloc &&
583             new.k->type != KEY_TYPE_alloc_v2)
584                 return 0;
585
586         /*
587          * alloc btree is read in by bch2_alloc_read, not gc:
588          */
589         if ((flags & BTREE_TRIGGER_GC) &&
590             !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
591                 return 0;
592
593         ca = bch_dev_bkey_exists(c, new.k->p.inode);
594
595         if (new.k->p.offset >= ca->mi.nbuckets)
596                 return 0;
597
598         g = __bucket(ca, new.k->p.offset, gc);
599         u = bch2_alloc_unpack(new);
600
601         old_m = bucket_cmpxchg(g, m, ({
602                 m.gen                   = u.gen;
603                 m.data_type             = u.data_type;
604                 m.dirty_sectors         = u.dirty_sectors;
605                 m.cached_sectors        = u.cached_sectors;
606                 m.stripe                = u.stripe != 0;
607
608                 if (journal_seq) {
609                         m.journal_seq_valid     = 1;
610                         m.journal_seq           = journal_seq;
611                 }
612         }));
613
614         bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
615
616         g->io_time[READ]        = u.read_time;
617         g->io_time[WRITE]       = u.write_time;
618         g->oldest_gen           = u.oldest_gen;
619         g->gen_valid            = 1;
620         g->stripe               = u.stripe;
621         g->stripe_redundancy    = u.stripe_redundancy;
622
623         /*
624          * need to know if we're getting called from the invalidate path or
625          * not:
626          */
627
628         if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
629             old_m.cached_sectors) {
630                 update_cached_sectors(c, fs_usage, ca->dev_idx,
631                                       -old_m.cached_sectors);
632                 trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
633                                  old_m.cached_sectors);
634         }
635
636         return 0;
637 }
638
639 #define checked_add(a, b)                                       \
640 ({                                                              \
641         unsigned _res = (unsigned) (a) + (b);                   \
642         bool overflow = _res > U16_MAX;                         \
643         if (overflow)                                           \
644                 _res = U16_MAX;                                 \
645         (a) = _res;                                             \
646         overflow;                                               \
647 })
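/*
 * Example (illustrative): checked_add(a, b) saturates at U16_MAX and reports
 * whether it clamped; with a == 65530 and b == 20 the sum 65550 exceeds
 * U16_MAX, so a becomes 65535 and the macro evaluates to true.
 */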
648
649 static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
650                                        size_t b, enum bch_data_type data_type,
651                                        unsigned sectors, bool gc)
652 {
653         struct bucket *g = __bucket(ca, b, gc);
654         struct bucket_mark old, new;
655         bool overflow;
656
657         BUG_ON(data_type != BCH_DATA_sb &&
658                data_type != BCH_DATA_journal);
659
660         old = bucket_cmpxchg(g, new, ({
661                 new.data_type   = data_type;
662                 overflow = checked_add(new.dirty_sectors, sectors);
663         }));
664
665         bch2_fs_inconsistent_on(old.data_type &&
666                                 old.data_type != data_type, c,
667                 "different types of data in same bucket: %s, %s",
668                 bch2_data_types[old.data_type],
669                 bch2_data_types[data_type]);
670
671         bch2_fs_inconsistent_on(overflow, c,
672                 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
673                 ca->dev_idx, b, new.gen,
674                 bch2_data_types[old.data_type ?: data_type],
675                 old.dirty_sectors, sectors);
676
677         if (c)
678                 bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
679                                       old, new, 0, gc);
680
681         return 0;
682 }
683
684 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
685                                size_t b, enum bch_data_type type,
686                                unsigned sectors, struct gc_pos pos,
687                                unsigned flags)
688 {
689         BUG_ON(type != BCH_DATA_sb &&
690                type != BCH_DATA_journal);
691
692         preempt_disable();
693
694         if (likely(c)) {
695                 do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
696                            ca, b, type, sectors);
697         } else {
698                 __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
699         }
700
701         preempt_enable();
702 }
703
704 static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors)
705 {
706         return DIV_ROUND_UP(sectors * n, d);
707 }
708
709 static s64 __ptr_disk_sectors_delta(unsigned old_size,
710                                     unsigned offset, s64 delta,
711                                     unsigned flags,
712                                     unsigned n, unsigned d)
713 {
714         BUG_ON(!n || !d);
715
716         if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) {
717                 BUG_ON(offset + -delta > old_size);
718
719                 return -disk_sectors_scaled(n, d, old_size) +
720                         disk_sectors_scaled(n, d, offset) +
721                         disk_sectors_scaled(n, d, old_size - offset + delta);
722         } else if (flags & BTREE_TRIGGER_OVERWRITE) {
723                 BUG_ON(offset + -delta > old_size);
724
725                 return -disk_sectors_scaled(n, d, old_size) +
726                         disk_sectors_scaled(n, d, old_size + delta);
727         } else {
728                 return  disk_sectors_scaled(n, d, delta);
729         }
730 }
731
732 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
733                                   unsigned offset, s64 delta,
734                                   unsigned flags)
735 {
736         return __ptr_disk_sectors_delta(p.crc.live_size,
737                                         offset, delta, flags,
738                                         p.crc.compressed_size,
739                                         p.crc.uncompressed_size);
740 }
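/*
 * Worked example (illustrative): for a compressed extent with
 * crc.live_size == 128, crc.compressed_size == 64 and
 * crc.uncompressed_size == 128 (2:1 compression), overwriting the last
 * 32 sectors (offset 96, delta -32, BTREE_TRIGGER_OVERWRITE) yields
 *
 *   -disk_sectors_scaled(64, 128, 128) + disk_sectors_scaled(64, 128, 96)
 *   = -64 + 48 = -16
 *
 * i.e. dropping 32 logical sectors releases 16 sectors on disk.
 */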
741
742 static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
743                             const struct bch_extent_ptr *ptr,
744                             s64 sectors, enum bch_data_type ptr_data_type,
745                             u8 bucket_gen, u8 bucket_data_type,
746                             u16 dirty_sectors, u16 cached_sectors)
747 {
748         size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
749         u16 bucket_sectors = !ptr->cached
750                 ? dirty_sectors
751                 : cached_sectors;
752         char buf[200];
753
754         if (gen_after(ptr->gen, bucket_gen)) {
755                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
756                         "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
757                         "while marking %s",
758                         ptr->dev, bucket_nr, bucket_gen,
759                         bch2_data_types[bucket_data_type ?: ptr_data_type],
760                         ptr->gen,
761                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
762                 return -EIO;
763         }
764
765         if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
766                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
767                         "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
768                         "while marking %s",
769                         ptr->dev, bucket_nr, bucket_gen,
770                         bch2_data_types[bucket_data_type ?: ptr_data_type],
771                         ptr->gen,
772                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
773                 return -EIO;
774         }
775
776         if (bucket_gen != ptr->gen && !ptr->cached) {
777                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
778                         "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
779                         "while marking %s",
780                         ptr->dev, bucket_nr, bucket_gen,
781                         bch2_data_types[bucket_data_type ?: ptr_data_type],
782                         ptr->gen,
783                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
784                 return -EIO;
785         }
786
787         if (bucket_gen != ptr->gen)
788                 return 1;
789
790         if (bucket_data_type && ptr_data_type &&
791             bucket_data_type != ptr_data_type) {
792                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
793                         "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
794                         "while marking %s",
795                         ptr->dev, bucket_nr, bucket_gen,
796                         bch2_data_types[bucket_data_type],
797                         bch2_data_types[ptr_data_type],
798                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
799                 return -EIO;
800         }
801
802         if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
803                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
804                         "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
805                         "while marking %s",
806                         ptr->dev, bucket_nr, bucket_gen,
807                         bch2_data_types[bucket_data_type ?: ptr_data_type],
808                         bucket_sectors, sectors,
809                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
810                 return -EIO;
811         }
812
813         return 0;
814 }
815
816 static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
817                              unsigned ptr_idx,
818                              struct bch_fs_usage *fs_usage,
819                              u64 journal_seq, unsigned flags)
820 {
821         const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
822         unsigned nr_data = s->nr_blocks - s->nr_redundant;
823         bool parity = ptr_idx >= nr_data;
824         const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
825         bool gc = flags & BTREE_TRIGGER_GC;
826         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
827         struct bucket *g = PTR_BUCKET(ca, ptr, gc);
828         struct bucket_mark new, old;
829         char buf[200];
830         int ret;
831
832         if (g->stripe && g->stripe != k.k->p.offset) {
833                 bch2_fs_inconsistent(c,
834                               "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
835                               ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
836                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
837                 return -EINVAL;
838         }
839
840         old = bucket_cmpxchg(g, new, ({
841                 ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
842                                        new.dirty_sectors, new.cached_sectors);
843                 if (ret)
844                         return ret;
845
846                 if (parity) {
847                         new.data_type           = BCH_DATA_parity;
848                         new.dirty_sectors       = le16_to_cpu(s->sectors);
849                 }
850
851                 if (journal_seq) {
852                         new.journal_seq_valid   = 1;
853                         new.journal_seq         = journal_seq;
854                 }
855         }));
856
857         g->stripe               = k.k->p.offset;
858         g->stripe_redundancy    = s->nr_redundant;
859
860         bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
861         return 0;
862 }
863
864 static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
865                           const struct bch_extent_ptr *ptr,
866                           s64 sectors, enum bch_data_type ptr_data_type,
867                           u8 bucket_gen, u8 *bucket_data_type,
868                           u16 *dirty_sectors, u16 *cached_sectors)
869 {
870         u16 *dst_sectors = !ptr->cached
871                 ? dirty_sectors
872                 : cached_sectors;
873         int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
874                                    bucket_gen, *bucket_data_type,
875                                    *dirty_sectors, *cached_sectors);
876
877         if (ret)
878                 return ret;
879
880         *dst_sectors += sectors;
881         *bucket_data_type = *dirty_sectors || *cached_sectors
882                 ? ptr_data_type : 0;
883         return 0;
884 }
885
886 static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
887                              struct extent_ptr_decoded p,
888                              s64 sectors, enum bch_data_type data_type,
889                              struct bch_fs_usage *fs_usage,
890                              u64 journal_seq, unsigned flags)
891 {
892         bool gc = flags & BTREE_TRIGGER_GC;
893         struct bucket_mark old, new;
894         struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
895         struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
896         u8 bucket_data_type;
897         u64 v;
898         int ret;
899
900         v = atomic64_read(&g->_mark.v);
901         do {
902                 new.v.counter = old.v.counter = v;
903                 bucket_data_type = new.data_type;
904
905                 ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
906                                      &bucket_data_type,
907                                      &new.dirty_sectors,
908                                      &new.cached_sectors);
909                 if (ret)
910                         return ret;
911
912                 new.data_type = bucket_data_type;
913
914                 if (journal_seq) {
915                         new.journal_seq_valid = 1;
916                         new.journal_seq = journal_seq;
917                 }
918
919                 if (flags & BTREE_TRIGGER_NOATOMIC) {
920                         g->_mark = new;
921                         break;
922                 }
923         } while ((v = atomic64_cmpxchg(&g->_mark.v,
924                               old.v.counter,
925                               new.v.counter)) != old.v.counter);
926
927         bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
928
929         BUG_ON(!gc && bucket_became_unavailable(old, new));
930
931         return 0;
932 }
933
934 static int bch2_mark_stripe_ptr(struct bch_fs *c,
935                                 struct bch_extent_stripe_ptr p,
936                                 enum bch_data_type data_type,
937                                 struct bch_fs_usage *fs_usage,
938                                 s64 sectors, unsigned flags)
939 {
940         bool gc = flags & BTREE_TRIGGER_GC;
941         struct bch_replicas_padded r;
942         struct stripe *m;
943         unsigned i, blocks_nonempty = 0;
944
945         m = genradix_ptr(&c->stripes[gc], p.idx);
946
947         spin_lock(&c->ec_stripes_heap_lock);
948
949         if (!m || !m->alive) {
950                 spin_unlock(&c->ec_stripes_heap_lock);
951                 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
952                                     (u64) p.idx);
953                 return -EIO;
954         }
955
956         m->block_sectors[p.block] += sectors;
957
958         r = m->r;
959
960         for (i = 0; i < m->nr_blocks; i++)
961                 blocks_nonempty += m->block_sectors[i] != 0;
962
963         if (m->blocks_nonempty != blocks_nonempty) {
964                 m->blocks_nonempty = blocks_nonempty;
965                 if (!gc)
966                         bch2_stripes_heap_update(c, m, p.idx);
967         }
968
969         spin_unlock(&c->ec_stripes_heap_lock);
970
971         r.e.data_type = data_type;
972         update_replicas(c, fs_usage, &r.e, sectors);
973
974         return 0;
975 }
976
977 static int bch2_mark_extent(struct bch_fs *c,
978                             struct bkey_s_c old, struct bkey_s_c new,
979                             unsigned offset, s64 sectors,
980                             enum bch_data_type data_type,
981                             struct bch_fs_usage *fs_usage,
982                             unsigned journal_seq, unsigned flags)
983 {
984         struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
985         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
986         const union bch_extent_entry *entry;
987         struct extent_ptr_decoded p;
988         struct bch_replicas_padded r;
989         s64 dirty_sectors = 0;
990         bool stale;
991         int ret;
992
993         r.e.data_type   = data_type;
994         r.e.nr_devs     = 0;
995         r.e.nr_required = 1;
996
997         BUG_ON(!sectors);
998
999         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1000                 s64 disk_sectors = data_type == BCH_DATA_btree
1001                         ? sectors
1002                         : ptr_disk_sectors_delta(p, offset, sectors, flags);
1003
1004                 ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
1005                                         fs_usage, journal_seq, flags);
1006                 if (ret < 0)
1007                         return ret;
1008
1009                 stale = ret > 0;
1010
1011                 if (p.ptr.cached) {
1012                         if (!stale)
1013                                 update_cached_sectors(c, fs_usage, p.ptr.dev,
1014                                                       disk_sectors);
1015                 } else if (!p.has_ec) {
1016                         dirty_sectors          += disk_sectors;
1017                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
1018                 } else {
1019                         ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
1020                                         fs_usage, disk_sectors, flags);
1021                         if (ret)
1022                                 return ret;
1023
1024                         /*
1025                          * There may be other dirty pointers in this extent, but
1026                          * if so they're not required for mounting if we have an
1027                          * erasure coded pointer in this extent:
1028                          */
1029                         r.e.nr_required = 0;
1030                 }
1031         }
1032
1033         if (r.e.nr_devs)
1034                 update_replicas(c, fs_usage, &r.e, dirty_sectors);
1035
1036         return 0;
1037 }
1038
1039 static int bch2_mark_stripe(struct bch_fs *c,
1040                             struct bkey_s_c old, struct bkey_s_c new,
1041                             struct bch_fs_usage *fs_usage,
1042                             u64 journal_seq, unsigned flags)
1043 {
1044         bool gc = flags & BTREE_TRIGGER_GC;
1045         size_t idx = new.k->p.offset;
1046         const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
1047                 ? bkey_s_c_to_stripe(old).v : NULL;
1048         const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
1049                 ? bkey_s_c_to_stripe(new).v : NULL;
1050         struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
1051         unsigned i;
1052         int ret;
1053
1054         BUG_ON(gc && old_s);
1055
1056         if (!m || (old_s && !m->alive)) {
1057                 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
1058                                     idx);
1059                 return -1;
1060         }
1061
1062         if (!new_s) {
1063                 spin_lock(&c->ec_stripes_heap_lock);
1064                 bch2_stripes_heap_del(c, m, idx);
1065                 spin_unlock(&c->ec_stripes_heap_lock);
1066
1067                 memset(m, 0, sizeof(*m));
1068         } else {
1069                 m->alive        = true;
1070                 m->sectors      = le16_to_cpu(new_s->sectors);
1071                 m->algorithm    = new_s->algorithm;
1072                 m->nr_blocks    = new_s->nr_blocks;
1073                 m->nr_redundant = new_s->nr_redundant;
1074                 m->blocks_nonempty = 0;
1075
1076                 for (i = 0; i < new_s->nr_blocks; i++) {
1077                         m->block_sectors[i] =
1078                                 stripe_blockcount_get(new_s, i);
1079                         m->blocks_nonempty += !!m->block_sectors[i];
1080
1081                         m->ptrs[i] = new_s->ptrs[i];
1082                 }
1083
1084                 bch2_bkey_to_replicas(&m->r.e, new);
1085
1086                 if (!gc) {
1087                         spin_lock(&c->ec_stripes_heap_lock);
1088                         bch2_stripes_heap_update(c, m, idx);
1089                         spin_unlock(&c->ec_stripes_heap_lock);
1090                 }
1091         }
1092
1093         if (gc) {
1094                 /*
1095                  * gc recalculates this field from stripe ptr
1096                  * references:
1097                  */
1098                 memset(m->block_sectors, 0, sizeof(m->block_sectors));
1099                 m->blocks_nonempty = 0;
1100
1101                 for (i = 0; i < new_s->nr_blocks; i++) {
1102                         ret = mark_stripe_bucket(c, new, i, fs_usage,
1103                                                  journal_seq, flags);
1104                         if (ret)
1105                                 return ret;
1106                 }
1107
1108                 update_replicas(c, fs_usage, &m->r.e,
1109                                 ((s64) m->sectors * m->nr_redundant));
1110         }
1111
1112         return 0;
1113 }
1114
1115 static int bch2_mark_key_locked(struct bch_fs *c,
1116                    struct bkey_s_c old,
1117                    struct bkey_s_c new,
1118                    unsigned offset, s64 sectors,
1119                    struct bch_fs_usage *fs_usage,
1120                    u64 journal_seq, unsigned flags)
1121 {
1122         struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1123         int ret = 0;
1124
1125         BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
1126
1127         preempt_disable();
1128
1129         if (!fs_usage || (flags & BTREE_TRIGGER_GC))
1130                 fs_usage = fs_usage_ptr(c, journal_seq,
1131                                         flags & BTREE_TRIGGER_GC);
1132
1133         switch (k.k->type) {
1134         case KEY_TYPE_alloc:
1135         case KEY_TYPE_alloc_v2:
1136                 ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
1137                 break;
1138         case KEY_TYPE_btree_ptr:
1139         case KEY_TYPE_btree_ptr_v2:
1140                 sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
1141                         ?  c->opts.btree_node_size
1142                         : -c->opts.btree_node_size;
1143
1144                 ret = bch2_mark_extent(c, old, new, offset, sectors,
1145                                 BCH_DATA_btree, fs_usage, journal_seq, flags);
1146                 break;
1147         case KEY_TYPE_extent:
1148         case KEY_TYPE_reflink_v:
1149                 ret = bch2_mark_extent(c, old, new, offset, sectors,
1150                                 BCH_DATA_user, fs_usage, journal_seq, flags);
1151                 break;
1152         case KEY_TYPE_stripe:
1153                 ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
1154                 break;
1155         case KEY_TYPE_inode:
1156                 fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
1157                 fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
1158                 break;
1159         case KEY_TYPE_reservation: {
1160                 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1161
1162                 sectors *= replicas;
1163                 replicas = clamp_t(unsigned, replicas, 1,
1164                                    ARRAY_SIZE(fs_usage->persistent_reserved));
1165
1166                 fs_usage->reserved                              += sectors;
1167                 fs_usage->persistent_reserved[replicas - 1]     += sectors;
1168                 break;
1169         }
1170         }
1171
1172         preempt_enable();
1173
1174         return ret;
1175 }
1176
1177 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new,
1178                   unsigned offset, s64 sectors,
1179                   struct bch_fs_usage *fs_usage,
1180                   u64 journal_seq, unsigned flags)
1181 {
1182         struct bkey deleted;
1183         struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
1184         int ret;
1185
1186         bkey_init(&deleted);
1187
1188         percpu_down_read(&c->mark_lock);
1189         ret = bch2_mark_key_locked(c, old, new, offset, sectors,
1190                                    fs_usage, journal_seq,
1191                                    BTREE_TRIGGER_INSERT|flags);
1192         percpu_up_read(&c->mark_lock);
1193
1194         return ret;
1195 }
1196
1197 int bch2_mark_update(struct btree_trans *trans,
1198                      struct btree_iter *iter,
1199                      struct bkey_i *new,
1200                      struct bch_fs_usage *fs_usage,
1201                      unsigned flags)
1202 {
1203         struct bch_fs           *c = trans->c;
1204         struct bkey_s_c         old;
1205         struct bkey             unpacked;
1206         int ret = 0;
1207
1208         if (unlikely(flags & BTREE_TRIGGER_NORUN))
1209                 return 0;
1210
1211         if (!btree_node_type_needs_gc(iter->btree_id))
1212                 return 0;
1213
1214         bkey_init(&unpacked);
1215         old = (struct bkey_s_c) { &unpacked, NULL };
1216
1217         if (!btree_node_type_is_extents(iter->btree_id)) {
1218                 /* iterators should be uptodate, shouldn't get errors here: */
1219                 if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
1220                         old = bch2_btree_iter_peek_slot(iter);
1221                         BUG_ON(bkey_err(old));
1222                 } else {
1223                         struct bkey_cached *ck = (void *) iter->l[0].b;
1224
1225                         if (ck->valid)
1226                                 old = bkey_i_to_s_c(ck->k);
1227                 }
1228
1229                 if (old.k->type == new->k.type) {
1230                         bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
1231                                 fs_usage, trans->journal_res.seq,
1232                                 BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
1233
1234                 } else {
1235                         bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
1236                                 fs_usage, trans->journal_res.seq,
1237                                 BTREE_TRIGGER_INSERT|flags);
1238                         bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0,
1239                                 fs_usage, trans->journal_res.seq,
1240                                 BTREE_TRIGGER_OVERWRITE|flags);
1241                 }
1242         } else {
1243                 struct btree_iter *copy;
1244
1245                 BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
1246                 bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
1247                         0, new->k.size,
1248                         fs_usage, trans->journal_res.seq,
1249                         BTREE_TRIGGER_INSERT|flags);
1250
1251                 copy = bch2_trans_copy_iter(trans, iter);
1252
1253                 for_each_btree_key_continue(copy, 0, old, ret) {
1254                         unsigned offset = 0;
1255                         s64 sectors = -((s64) old.k->size);
1256
1257                         flags |= BTREE_TRIGGER_OVERWRITE;
1258
1259                         if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
1260                                 break;
1261
1262                         switch (bch2_extent_overlap(&new->k, old.k)) {
1263                         case BCH_EXTENT_OVERLAP_ALL:
1264                                 offset = 0;
1265                                 sectors = -((s64) old.k->size);
1266                                 break;
1267                         case BCH_EXTENT_OVERLAP_BACK:
1268                                 offset = bkey_start_offset(&new->k) -
1269                                         bkey_start_offset(old.k);
1270                                 sectors = bkey_start_offset(&new->k) -
1271                                         old.k->p.offset;
1272                                 break;
1273                         case BCH_EXTENT_OVERLAP_FRONT:
1274                                 offset = 0;
1275                                 sectors = bkey_start_offset(old.k) -
1276                                         new->k.p.offset;
1277                                 break;
1278                         case BCH_EXTENT_OVERLAP_MIDDLE:
1279                                 offset = bkey_start_offset(&new->k) -
1280                                         bkey_start_offset(old.k);
1281                                 sectors = -((s64) new->k.size);
1282                                 flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
1283                                 break;
1284                         }
1285
1286                         BUG_ON(sectors >= 0);
1287
1288                         ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
1289                                         offset, sectors, fs_usage,
1290                                         trans->journal_res.seq, flags) ?: 1;
1291                         if (ret <= 0)
1292                                 break;
1293                 }
1294                 bch2_trans_iter_put(trans, copy);
1295         }
1296
1297         return ret;
1298 }
1299
1300 static noinline __cold
1301 void fs_usage_apply_warn(struct btree_trans *trans,
1302                          unsigned disk_res_sectors)
1303 {
1304         struct bch_fs *c = trans->c;
1305         struct btree_insert_entry *i;
1306         char buf[200];
1307
1308         bch_err(c, "disk usage increased more than %u sectors reserved",
1309                 disk_res_sectors);
1310
1311         trans_for_each_update(trans, i) {
1312                 pr_err("while inserting");
1313                 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
1314                 pr_err("%s", buf);
1315                 pr_err("overlapping with");
1316
1317                 if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
1318                         struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
1319                         struct bkey_s_c k;
1320                         int ret;
1321
1322                         for_each_btree_key_continue(copy, 0, k, ret) {
1323                                 if (btree_node_type_is_extents(i->iter->btree_id)
1324                                     ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
1325                                     : bkey_cmp(i->k->k.p, k.k->p))
1326                                         break;
1327
1328                                 bch2_bkey_val_to_text(&PBUF(buf), c, k);
1329                                 pr_err("%s", buf);
1330                         }
1331                         bch2_trans_iter_put(trans, copy);
1332                 } else {
1333                         struct bkey_cached *ck = (void *) i->iter->l[0].b;
1334
1335                         if (ck->valid) {
1336                                 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
1337                                 pr_err("%s", buf);
1338                         }
1339                 }
1340         }
1341 }
1342
1343 void bch2_trans_fs_usage_apply(struct btree_trans *trans,
1344                                struct replicas_delta_list *deltas)
1345 {
1346         struct bch_fs *c = trans->c;
1347         static int warned_disk_usage = 0;
1348         bool warn = false;
1349         unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
1350         struct replicas_delta *d = deltas->d;
1351         struct replicas_delta *top = (void *) deltas->d + deltas->used;
1352         struct bch_fs_usage *dst;
1353         s64 added = 0, should_not_have_added;
1354         unsigned i;
1355
1356         percpu_rwsem_assert_held(&c->mark_lock);
1357
1358         preempt_disable();
1359         dst = fs_usage_ptr(c, trans->journal_res.seq, false);
1360
1361         for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
1362                 switch (d->r.data_type) {
1363                 case BCH_DATA_btree:
1364                 case BCH_DATA_user:
1365                 case BCH_DATA_parity:
1366                         added += d->delta;
1367                 }
1368
1369                 update_replicas(c, dst, &d->r, d->delta);
1370         }
1371
1372         dst->nr_inodes += deltas->nr_inodes;
1373
1374         for (i = 0; i < BCH_REPLICAS_MAX; i++) {
1375                 added                           += deltas->persistent_reserved[i];
1376                 dst->reserved                   += deltas->persistent_reserved[i];
1377                 dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
1378         }
1379
1380         /*
1381          * Not allowed to reduce sectors_available except by getting a
1382          * reservation:
1383          */
1384         should_not_have_added = added - (s64) disk_res_sectors;
1385         if (unlikely(should_not_have_added > 0)) {
1386                 atomic64_sub(should_not_have_added, &c->sectors_available);
1387                 added -= should_not_have_added;
1388                 warn = true;
1389         }
1390
1391         if (added > 0) {
1392                 trans->disk_res->sectors -= added;
1393                 this_cpu_sub(*c->online_reserved, added);
1394         }
1395
1396         preempt_enable();
1397
1398         if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
1399                 fs_usage_apply_warn(trans, disk_res_sectors);
1400 }
1401
1402 /* trans_mark: */
1403
1404 static struct btree_iter *trans_get_update(struct btree_trans *trans,
1405                             enum btree_id btree_id, struct bpos pos,
1406                             struct bkey_s_c *k)
1407 {
1408         struct btree_insert_entry *i;
1409
1410         trans_for_each_update(trans, i)
1411                 if (i->iter->btree_id == btree_id &&
1412                     (btree_node_type_is_extents(btree_id)
1413                      ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
1414                        bkey_cmp(pos, i->k->k.p) < 0
1415                      : !bkey_cmp(pos, i->iter->pos))) {
1416                         *k = bkey_i_to_s_c(i->k);
1417
1418                         /* ugly hack.. */
1419                         BUG_ON(btree_iter_live(trans, i->iter));
1420                         trans->iters_live |= 1ULL << i->iter->idx;
1421                         return i->iter;
1422                 }
1423
1424         return NULL;
1425 }
1426
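/*
 * Get the current key at @pos, preferring a not-yet-committed update from this
 * transaction if one exists (in which case 1 is returned). Otherwise a new
 * iterator is created - cached for the alloc btree, BTREE_ITER_SLOTS for
 * everything else - and the key is read from the btree.
 */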
1427 static int trans_get_key(struct btree_trans *trans,
1428                          enum btree_id btree_id, struct bpos pos,
1429                          struct btree_iter **iter,
1430                          struct bkey_s_c *k)
1431 {
1432         unsigned flags = btree_id != BTREE_ID_alloc
1433                 ? BTREE_ITER_SLOTS
1434                 : BTREE_ITER_CACHED;
1435         int ret;
1436
1437         *iter = trans_get_update(trans, btree_id, pos, k);
1438         if (*iter)
1439                 return 1;
1440
1441         *iter = bch2_trans_get_iter(trans, btree_id, pos,
1442                                     flags|BTREE_ITER_INTENT);
1443         *k = __bch2_btree_iter_peek(*iter, flags);
1444         ret = bkey_err(*k);
1445         if (ret)
1446                 bch2_trans_iter_put(trans, *iter);
1447         return ret;
1448 }
1449
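/*
 * Begin a transactional update of the alloc key for the bucket @ptr points
 * into: allocate space for the new packed alloc key, set up an iterator
 * positioned on the bucket, and unpack the current contents into @u (falling
 * back to the in-memory bucket mark if the btree key isn't cached yet).
 */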
1450 static struct bkey_alloc_buf *
1451 bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
1452                               const struct bch_extent_ptr *ptr,
1453                               struct bkey_alloc_unpacked *u)
1454 {
1455         struct bch_fs *c = trans->c;
1456         struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1457         struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
1458         struct bucket *g;
1459         struct btree_iter *iter;
1460         struct bkey_s_c k;
1461         struct bkey_alloc_buf *a;
1462         int ret;
1463
1464         a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
1465         if (IS_ERR(a))
1466                 return a;
1467
1468         iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
1469         if (iter) {
1470                 *u = bch2_alloc_unpack(k);
1471         } else {
1472                 iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
1473                                            BTREE_ITER_CACHED|
1474                                            BTREE_ITER_CACHED_NOFILL|
1475                                            BTREE_ITER_INTENT);
1476                 ret = bch2_btree_iter_traverse(iter);
1477                 if (ret) {
1478                         bch2_trans_iter_put(trans, iter);
1479                         return ERR_PTR(ret);
1480                 }
1481
1482                 percpu_down_read(&c->mark_lock);
1483                 g = bucket(ca, pos.offset);
1484                 *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
1485                 percpu_up_read(&c->mark_lock);
1486         }
1487
1488         *_iter = iter;
1489         return a;
1490 }
1491
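/*
 * Transactional counterpart to marking a single extent pointer: account
 * @sectors of @data_type against the bucket the pointer lands in, via an
 * update to that bucket's alloc key. Returns > 0 if the pointer turned out to
 * be stale.
 */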
1492 static int bch2_trans_mark_pointer(struct btree_trans *trans,
1493                         struct bkey_s_c k, struct extent_ptr_decoded p,
1494                         s64 sectors, enum bch_data_type data_type)
1495 {
1496         struct bch_fs *c = trans->c;
1497         struct btree_iter *iter;
1498         struct bkey_alloc_unpacked u;
1499         struct bkey_alloc_buf *a;
1500         int ret;
1501
1502         a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
1503         if (IS_ERR(a))
1504                 return PTR_ERR(a);
1505
1506         ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
1507                              &u.dirty_sectors, &u.cached_sectors);
1508         if (ret)
1509                 goto out;
1510
1511         bch2_alloc_pack(c, a, u);
1512         bch2_trans_update(trans, iter, &a->k, 0);
1513 out:
1514         bch2_trans_iter_put(trans, iter);
1515         return ret;
1516 }
1517
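/*
 * Account sectors written through an erasure coded pointer: bump the per-block
 * sector count in the stripe key it references and credit the stripe's
 * replicas entry, failing with -EIO if the stripe is missing or doesn't match
 * the pointer.
 */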
1518 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
1519                         struct extent_ptr_decoded p,
1520                         s64 sectors, enum bch_data_type data_type)
1521 {
1522         struct bch_fs *c = trans->c;
1523         struct btree_iter *iter;
1524         struct bkey_s_c k;
1525         struct bkey_i_stripe *s;
1526         struct bch_replicas_padded r;
1527         int ret = 0;
1528
1529         ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k);
1530         if (ret < 0)
1531                 return ret;
1532
1533         if (k.k->type != KEY_TYPE_stripe) {
1534                 bch2_fs_inconsistent(c,
1535                         "pointer to nonexistent stripe %llu",
1536                         (u64) p.ec.idx);
1537                 ret = -EIO;
1538                 goto out;
1539         }
1540
1541         if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
1542                 bch2_fs_inconsistent(c,
1543                         "stripe pointer doesn't match stripe %llu",
1544                         (u64) p.ec.idx);
1545                 ret = -EIO;
1546                 goto out;
1547         }
1548
1549         s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1550         ret = PTR_ERR_OR_ZERO(s);
1551         if (ret)
1552                 goto out;
1553
1554         bkey_reassemble(&s->k_i, k);
1555         stripe_blockcount_set(&s->v, p.ec.block,
1556                 stripe_blockcount_get(&s->v, p.ec.block) +
1557                 sectors);
1558         bch2_trans_update(trans, iter, &s->k_i, 0);
1559
1560         bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
1561         r.e.data_type = data_type;
1562         update_replicas_list(trans, &r.e, sectors);
1563 out:
1564         bch2_trans_iter_put(trans, iter);
1565         return ret;
1566 }
1567
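/*
 * Transactional mark for extent-ish keys (also used for btree node pointers):
 * walk every pointer, updating the corresponding bucket, cached-sector and
 * stripe accounting, then add the dirty sectors to the replicas entry
 * describing the non-erasure-coded pointers.
 */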
1568 static int bch2_trans_mark_extent(struct btree_trans *trans,
1569                         struct bkey_s_c k, unsigned offset,
1570                         s64 sectors, unsigned flags,
1571                         enum bch_data_type data_type)
1572 {
1573         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1574         const union bch_extent_entry *entry;
1575         struct extent_ptr_decoded p;
1576         struct bch_replicas_padded r;
1577         s64 dirty_sectors = 0;
1578         bool stale;
1579         int ret;
1580
1581         r.e.data_type   = data_type;
1582         r.e.nr_devs     = 0;
1583         r.e.nr_required = 1;
1584
1585         BUG_ON(!sectors);
1586
1587         bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1588                 s64 disk_sectors = data_type == BCH_DATA_btree
1589                         ? sectors
1590                         : ptr_disk_sectors_delta(p, offset, sectors, flags);
1591
1592                 ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors,
1593                                               data_type);
1594                 if (ret < 0)
1595                         return ret;
1596
1597                 stale = ret > 0;
1598
1599                 if (p.ptr.cached) {
1600                         if (!stale)
1601                                 update_cached_sectors_list(trans, p.ptr.dev,
1602                                                            disk_sectors);
1603                 } else if (!p.has_ec) {
1604                         dirty_sectors          += disk_sectors;
1605                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
1606                 } else {
1607                         ret = bch2_trans_mark_stripe_ptr(trans, p,
1608                                         disk_sectors, data_type);
1609                         if (ret)
1610                                 return ret;
1611
1612                         r.e.nr_required = 0;
1613                 }
1614         }
1615
1616         if (r.e.nr_devs)
1617                 update_replicas_list(trans, &r.e, dirty_sectors);
1618
1619         return 0;
1620 }
1621
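/*
 * Update the alloc key for one block of a stripe: parity blocks carry the
 * stripe's sectors as BCH_DATA_parity, and every block records (or clears,
 * when deleting) which stripe owns the bucket and with what redundancy.
 */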
1622 static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
1623                                             struct bkey_s_c_stripe s,
1624                                             unsigned idx, bool deleting)
1625 {
1626         struct bch_fs *c = trans->c;
1627         const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
1628         struct bkey_alloc_buf *a;
1629         struct btree_iter *iter;
1630         struct bkey_alloc_unpacked u;
1631         bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
1632         int ret = 0;
1633
1634         a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
1635         if (IS_ERR(a))
1636                 return PTR_ERR(a);
1637
1638         if (parity) {
1639                 s64 sectors = le16_to_cpu(s.v->sectors);
1640
1641                 if (deleting)
1642                         sectors = -sectors;
1643
1644                 u.dirty_sectors += sectors;
1645                 u.data_type = u.dirty_sectors
1646                         ? BCH_DATA_parity
1647                         : 0;
1648         }
1649
1650         if (!deleting) {
1651                 if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
1652                                 "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
1653                                 iter->pos.inode, iter->pos.offset, u.gen,
1654                                 u.stripe, s.k->p.offset)) {
1655                         ret = -EIO;
1656                         goto err;
1657                 }
1658
1659                 u.stripe                = s.k->p.offset;
1660                 u.stripe_redundancy     = s.v->nr_redundant;
1661         } else {
1662                 u.stripe                = 0;
1663                 u.stripe_redundancy     = 0;
1664         }
1665
1666         bch2_alloc_pack(c, a, u);
1667         bch2_trans_update(trans, iter, &a->k, 0);
1668 err:
1669         bch2_trans_iter_put(trans, iter);
1670         return ret;
1671 }
1672
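/*
 * Trigger for stripe keys: if the blocks and pointers haven't changed there's
 * nothing to do; otherwise credit the new stripe's parity sectors and back out
 * the old stripe's, updating each referenced bucket's alloc key along the way.
 */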
1673 static int bch2_trans_mark_stripe(struct btree_trans *trans,
1674                                   struct bkey_s_c old, struct bkey_s_c new,
1675                                   unsigned flags)
1676 {
1677         struct bkey_s_c_stripe old_s = { NULL };
1678         struct bkey_s_c_stripe new_s = { NULL };
1679         struct bch_replicas_padded r;
1680         unsigned i;
1681         int ret = 0;
1682
1683         if (old.k->type == KEY_TYPE_stripe)
1684                 old_s = bkey_s_c_to_stripe(old);
1685         if (new.k->type == KEY_TYPE_stripe)
1686                 new_s = bkey_s_c_to_stripe(new);
1687
1688         /*
1689          * If the pointers aren't changing, we don't need to do anything:
1690          */
1691         if (new_s.k && old_s.k &&
1692             new_s.v->nr_blocks          == old_s.v->nr_blocks &&
1693             new_s.v->nr_redundant       == old_s.v->nr_redundant &&
1694             !memcmp(old_s.v->ptrs, new_s.v->ptrs,
1695                     new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
1696                 return 0;
1697
1698         if (new_s.k) {
1699                 s64 sectors = le16_to_cpu(new_s.v->sectors);
1700
1701                 bch2_bkey_to_replicas(&r.e, new);
1702                 update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
1703
1704                 for (i = 0; i < new_s.v->nr_blocks; i++) {
1705                         ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
1706                                                                i, false);
1707                         if (ret)
1708                                 return ret;
1709                 }
1710         }
1711
1712         if (old_s.k) {
1713                 s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
1714
1715                 bch2_bkey_to_replicas(&r.e, old);
1716                 update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
1717
1718                 for (i = 0; i < old_s.v->nr_blocks; i++) {
1719                         ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
1720                                                                i, true);
1721                         if (ret)
1722                                 return ret;
1723                 }
1724         }
1725
1726         return ret;
1727 }
1728
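/* Return the refcount field for key types that have one, NULL otherwise: */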
1729 static __le64 *bkey_refcount(struct bkey_i *k)
1730 {
1731         switch (k->k.type) {
1732         case KEY_TYPE_reflink_v:
1733                 return &bkey_i_to_reflink_v(k)->v.refcount;
1734         case KEY_TYPE_indirect_inline_data:
1735                 return &bkey_i_to_indirect_inline_data(k)->v.refcount;
1736         default:
1737                 return NULL;
1738         }
1739 }
1740
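/*
 * Adjust the refcount on the indirect extent at @idx that @p points to:
 * incremented on insert, decremented on overwrite, with the key deleted once
 * the count hits zero. Returns the number of sectors processed so the caller
 * can advance, since one reflink pointer may span several indirect extents.
 */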
1741 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
1742                         struct bkey_s_c_reflink_p p,
1743                         u64 idx, unsigned sectors,
1744                         unsigned flags)
1745 {
1746         struct bch_fs *c = trans->c;
1747         struct btree_iter *iter;
1748         struct bkey_s_c k;
1749         struct bkey_i *n;
1750         __le64 *refcount;
1751         s64 ret;
1752
1753         ret = trans_get_key(trans, BTREE_ID_reflink,
1754                             POS(0, idx), &iter, &k);
1755         if (ret < 0)
1756                 return ret;
1757
1758         if ((flags & BTREE_TRIGGER_OVERWRITE) &&
1759             (bkey_start_offset(k.k) < idx ||
1760              k.k->p.offset > idx + sectors))
1761                 goto out;
1762
1763         sectors = k.k->p.offset - idx;
1764
1765         n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1766         ret = PTR_ERR_OR_ZERO(n);
1767         if (ret)
1768                 goto err;
1769
1770         bkey_reassemble(n, k);
1771
1772         refcount = bkey_refcount(n);
1773         if (!refcount) {
1774                 bch2_fs_inconsistent(c,
1775                         "%llu:%llu len %u points to nonexistent indirect extent %llu",
1776                         p.k->p.inode, p.k->p.offset, p.k->size, idx);
1777                 ret = -EIO;
1778                 goto err;
1779         }
1780
1781         le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1);
1782
1783         if (!*refcount) {
1784                 n->k.type = KEY_TYPE_deleted;
1785                 set_bkey_val_u64s(&n->k, 0);
1786         }
1787
1788         bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
1789         bch2_trans_update(trans, iter, n, 0);
1790 out:
1791         ret = sectors;
1792 err:
1793         bch2_trans_iter_put(trans, iter);
1794         return ret;
1795 }
1796
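/*
 * Walk the range of indirect extents covered by a reflink pointer, adjusting
 * each one's refcount in turn.
 */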
1797 static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
1798                         struct bkey_s_c_reflink_p p, unsigned offset,
1799                         s64 sectors, unsigned flags)
1800 {
1801         u64 idx = le64_to_cpu(p.v->idx) + offset;
1802         s64 ret = 0;
1803
1804         sectors = abs(sectors);
1805         BUG_ON(offset + sectors > p.k->size);
1806
1807         while (sectors) {
1808                 ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
1809                 if (ret < 0)
1810                         break;
1811
1812                 idx += ret;
1813                 sectors = max_t(s64, 0LL, sectors - ret);
1814                 ret = 0;
1815         }
1816
1817         return ret;
1818 }
1819
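/*
 * Transactional mark function: does the same accounting as the in-memory mark
 * path, but expressed as btree updates and replicas-delta-list entries that
 * take effect atomically with the rest of the transaction.
 */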
1820 int bch2_trans_mark_key(struct btree_trans *trans,
1821                         struct bkey_s_c old,
1822                         struct bkey_s_c new,
1823                         unsigned offset, s64 sectors, unsigned flags)
1824 {
1825         struct bch_fs *c = trans->c;
1826         struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1827         struct replicas_delta_list *d;
1828
1829         BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
1830
1831         switch (k.k->type) {
1832         case KEY_TYPE_btree_ptr:
1833         case KEY_TYPE_btree_ptr_v2:
1834                 sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
1835                         ?  c->opts.btree_node_size
1836                         : -c->opts.btree_node_size;
1837
1838                 return bch2_trans_mark_extent(trans, k, offset, sectors,
1839                                               flags, BCH_DATA_btree);
1840         case KEY_TYPE_extent:
1841         case KEY_TYPE_reflink_v:
1842                 return bch2_trans_mark_extent(trans, k, offset, sectors,
1843                                               flags, BCH_DATA_user);
1844         case KEY_TYPE_stripe:
1845                 return bch2_trans_mark_stripe(trans, old, new, flags);
1846         case KEY_TYPE_inode: {
1847                 int nr = (new.k->type == KEY_TYPE_inode) -
1848                          (old.k->type == KEY_TYPE_inode);
1849
1850                 if (nr) {
1851                         d = replicas_deltas_realloc(trans, 0);
1852                         d->nr_inodes += nr;
1853                 }
1854
1855                 return 0;
1856         }
1857         case KEY_TYPE_reservation: {
1858                 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1859
1860                 d = replicas_deltas_realloc(trans, 0);
1861
1862                 sectors *= replicas;
1863                 replicas = clamp_t(unsigned, replicas, 1,
1864                                    ARRAY_SIZE(d->persistent_reserved));
1865
1866                 d->persistent_reserved[replicas - 1] += sectors;
1867                 return 0;
1868         }
1869         case KEY_TYPE_reflink_p:
1870                 return bch2_trans_mark_reflink_p(trans,
1871                                         bkey_s_c_to_reflink_p(k),
1872                                         offset, sectors, flags);
1873         default:
1874                 return 0;
1875         }
1876 }
1877
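/*
 * Run transactional triggers for an update within a transaction: for
 * non-extent btrees the single overwritten key is marked; for extents, every
 * existing key the new key overlaps is marked as (partially) overwritten, with
 * offset/sectors describing how much of it goes away.
 */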
1878 int bch2_trans_mark_update(struct btree_trans *trans,
1879                            struct btree_iter *iter,
1880                            struct bkey_i *new,
1881                            unsigned flags)
1882 {
1883         struct bkey_s_c old;
1884         int ret;
1885
1886         if (unlikely(flags & BTREE_TRIGGER_NORUN))
1887                 return 0;
1888
1889         if (!btree_node_type_needs_gc(iter->btree_id))
1890                 return 0;
1891
1892         if (!btree_node_type_is_extents(iter->btree_id)) {
1893                 /* iterators should be uptodate, shouldn't get errors here: */
1894                 if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
1895                         old = bch2_btree_iter_peek_slot(iter);
1896                         BUG_ON(bkey_err(old));
1897                 } else {
1898                         struct bkey_cached *ck = (void *) iter->l[0].b;
1899
1900                         BUG_ON(!ck->valid);
1901                         old = bkey_i_to_s_c(ck->k);
1902                 }
1903
1904                 if (old.k->type == new->k.type) {
1905                         ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
1906                                         BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
1907                 } else {
1908                         ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
1909                                         BTREE_TRIGGER_INSERT|flags) ?:
1910                                 bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
1911                                         BTREE_TRIGGER_OVERWRITE|flags);
1912                 }
1913         } else {
1914                 struct btree_iter *copy;
1915                 struct bkey _old;
1916
1917                 EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
1918
1919                 bkey_init(&_old);
1920                 old = (struct bkey_s_c) { &_old, NULL };
1921
1922                 ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
1923                                           0, new->k.size,
1924                                           BTREE_TRIGGER_INSERT);
1925                 if (ret)
1926                         return ret;
1927
1928                 copy = bch2_trans_copy_iter(trans, iter);
1929
1930                 for_each_btree_key_continue(copy, 0, old, ret) {
1931                         unsigned offset = 0;
1932                         s64 sectors = -((s64) old.k->size);
1933
1934                         flags |= BTREE_TRIGGER_OVERWRITE;
1935
1936                         if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
1937                                 break;
1938
1939                         switch (bch2_extent_overlap(&new->k, old.k)) {
1940                         case BCH_EXTENT_OVERLAP_ALL:
1941                                 offset = 0;
1942                                 sectors = -((s64) old.k->size);
1943                                 break;
1944                         case BCH_EXTENT_OVERLAP_BACK:
1945                                 offset = bkey_start_offset(&new->k) -
1946                                         bkey_start_offset(old.k);
1947                                 sectors = bkey_start_offset(&new->k) -
1948                                         old.k->p.offset;
1949                                 break;
1950                         case BCH_EXTENT_OVERLAP_FRONT:
1951                                 offset = 0;
1952                                 sectors = bkey_start_offset(old.k) -
1953                                         new->k.p.offset;
1954                                 break;
1955                         case BCH_EXTENT_OVERLAP_MIDDLE:
1956                                 offset = bkey_start_offset(&new->k) -
1957                                         bkey_start_offset(old.k);
1958                                 sectors = -((s64) new->k.size);
1959                                 flags |= BTREE_TRIGGER_OVERWRITE_SPLIT;
1960                                 break;
1961                         }
1962
1963                         BUG_ON(sectors >= 0);
1964
1965                         ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
1966                                         offset, sectors, flags);
1967                         if (ret)
1968                                 break;
1969                 }
1970                 bch2_trans_iter_put(trans, copy);
1971         }
1972
1973         return ret;
1974 }
1975
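/*
 * Mark a bucket as holding @sectors of metadata of the given type by rewriting
 * its alloc key; flags filesystem inconsistencies (mixed data types, sector
 * count overflow) instead of silently clobbering them.
 */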
1976 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1977                                     struct bch_dev *ca, size_t b,
1978                                     enum bch_data_type type,
1979                                     unsigned sectors)
1980 {
1981         struct bch_fs *c = trans->c;
1982         struct btree_iter *iter;
1983         struct bkey_alloc_unpacked u;
1984         struct bkey_alloc_buf *a;
1985         struct bch_extent_ptr ptr = {
1986                 .dev = ca->dev_idx,
1987                 .offset = bucket_to_sector(ca, b),
1988         };
1989         int ret = 0;
1990
1991         a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
1992         if (IS_ERR(a))
1993                 return PTR_ERR(a);
1994
1995         if (u.data_type && u.data_type != type) {
1996                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
1997                         "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
1998                         "while marking %s",
1999                         iter->pos.inode, iter->pos.offset, u.gen,
2000                         bch2_data_types[u.data_type],
2001                         bch2_data_types[type],
2002                         bch2_data_types[type]);
2003                 ret = -EIO;
2004                 goto out;
2005         }
2006
2007         if ((unsigned) (u.dirty_sectors + sectors) > ca->mi.bucket_size) {
2008                 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
2009                         "bucket %llu:%llu gen %u data type %s sector count overflow: %u + %u > %u\n"
2010                         "while marking %s",
2011                         iter->pos.inode, iter->pos.offset, u.gen,
2012                         bch2_data_types[u.data_type ?: type],
2013                         u.dirty_sectors, sectors, ca->mi.bucket_size,
2014                         bch2_data_types[type]);
2015                 ret = -EIO;
2016                 goto out;
2017         }
2018
2019         if (u.data_type         == type &&
2020             u.dirty_sectors     == sectors)
2021                 goto out;
2022
2023         u.data_type     = type;
2024         u.dirty_sectors = sectors;
2025
2026         bch2_alloc_pack(c, a, u);
2027         bch2_trans_update(trans, iter, &a->k, 0);
2028 out:
2029         bch2_trans_iter_put(trans, iter);
2030         return ret;
2031 }
2032
2033 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
2034                                     struct disk_reservation *res,
2035                                     struct bch_dev *ca, size_t b,
2036                                     enum bch_data_type type,
2037                                     unsigned sectors)
2038 {
2039         return __bch2_trans_do(trans, res, NULL, 0,
2040                         __bch2_trans_mark_metadata_bucket(trans, ca, b,
2041                                                           type, sectors));
2043 }
2044
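/*
 * Accumulate a run of metadata sectors into per-bucket chunks: *bucket and
 * *bucket_sectors carry state between calls, so adjacent ranges that land in
 * the same bucket are marked with a single btree update.
 */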
2045 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
2046                                             struct disk_reservation *res,
2047                                             struct bch_dev *ca,
2048                                             u64 start, u64 end,
2049                                             enum bch_data_type type,
2050                                             u64 *bucket, unsigned *bucket_sectors)
2051 {
2052         int ret = 0;
2053
2054         do {
2055                 u64 b = sector_to_bucket(ca, start);
2056                 unsigned sectors =
2057                         min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
2058
2059                 if (b != *bucket) {
2060                         if (*bucket_sectors) {
2061                                 ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
2062                                                 *bucket, type, *bucket_sectors);
2063                                 if (ret)
2064                                         return ret;
2065                         }
2066
2067                         *bucket         = b;
2068                         *bucket_sectors = 0;
2069                 }
2070
2071                 *bucket_sectors += sectors;
2072                 start += sectors;
2073         } while (!ret && start < end);
2074
2075         return 0;
2076 }
2077
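/*
 * Mark everything the superblock layout and journal occupy on @ca: each
 * superblock copy (plus the sectors preceding the default superblock) and
 * every journal bucket.
 */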
2078 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
2079                              struct disk_reservation *res,
2080                              struct bch_dev *ca)
2081 {
2082         struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
2083         u64 bucket = 0;
2084         unsigned i, bucket_sectors = 0;
2085         int ret;
2086
2087         for (i = 0; i < layout->nr_superblocks; i++) {
2088                 u64 offset = le64_to_cpu(layout->sb_offset[i]);
2089
2090                 if (offset == BCH_SB_SECTOR) {
2091                         ret = bch2_trans_mark_metadata_sectors(trans, res, ca,
2092                                                 0, BCH_SB_SECTOR,
2093                                                 BCH_DATA_sb, &bucket, &bucket_sectors);
2094                         if (ret)
2095                                 return ret;
2096                 }
2097
2098                 ret = bch2_trans_mark_metadata_sectors(trans, res, ca, offset,
2099                                       offset + (1 << layout->sb_max_size_bits),
2100                                       BCH_DATA_sb, &bucket, &bucket_sectors);
2101                 if (ret)
2102                         return ret;
2103         }
2104
2105         if (bucket_sectors) {
2106                 ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
2107                                 bucket, BCH_DATA_sb, bucket_sectors);
2108                 if (ret)
2109                         return ret;
2110         }
2111
2112         for (i = 0; i < ca->journal.nr; i++) {
2113                 ret = bch2_trans_mark_metadata_bucket(trans, res, ca,
2114                                 ca->journal.buckets[i],
2115                                 BCH_DATA_journal, ca->mi.bucket_size);
2116                 if (ret)
2117                         return ret;
2118         }
2119
2120         return 0;
2121 }
2122
2123 int bch2_trans_mark_dev_sb(struct bch_fs *c,
2124                            struct disk_reservation *res,
2125                            struct bch_dev *ca)
2126 {
2127         return bch2_trans_do(c, res, NULL, 0,
2128                         __bch2_trans_mark_dev_sb(&trans, res, ca));
2129 }
2130
2131 /* Disk reservations: */
2132
2133 #define SECTORS_CACHE   1024
2134
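/*
 * Take a disk reservation for @sectors: the fast path takes them from a percpu
 * cache refilled in SECTORS_CACHE-sized chunks from c->sectors_available; if
 * that runs dry we drop to the slow path, recompute what's actually free under
 * sectors_available_lock, and fail with -ENOSPC unless
 * BCH_DISK_RESERVATION_NOFAIL is set.
 *
 * Typical use is something like the sketch below (releasing the reservation
 * with bch2_disk_reservation_put() is an assumption, not shown in this file):
 *
 *	struct disk_reservation res = { 0 };
 *	int ret = bch2_disk_reservation_add(c, &res, sectors, 0);
 *
 *	if (ret)
 *		return ret;
 *	// ... do the write the reservation covers ...
 *	bch2_disk_reservation_put(c, &res);
 */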
2135 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
2136                               u64 sectors, int flags)
2137 {
2138         struct bch_fs_pcpu *pcpu;
2139         u64 old, v, get;
2140         s64 sectors_available;
2141         int ret;
2142
2143         percpu_down_read(&c->mark_lock);
2144         preempt_disable();
2145         pcpu = this_cpu_ptr(c->pcpu);
2146
2147         if (sectors <= pcpu->sectors_available)
2148                 goto out;
2149
2150         v = atomic64_read(&c->sectors_available);
2151         do {
2152                 old = v;
2153                 get = min((u64) sectors + SECTORS_CACHE, old);
2154
2155                 if (get < sectors) {
2156                         preempt_enable();
2157                         goto recalculate;
2158                 }
2159         } while ((v = atomic64_cmpxchg(&c->sectors_available,
2160                                        old, old - get)) != old);
2161
2162         pcpu->sectors_available         += get;
2163
2164 out:
2165         pcpu->sectors_available         -= sectors;
2166         this_cpu_add(*c->online_reserved, sectors);
2167         res->sectors                    += sectors;
2168
2169         preempt_enable();
2170         percpu_up_read(&c->mark_lock);
2171         return 0;
2172
2173 recalculate:
2174         mutex_lock(&c->sectors_available_lock);
2175
2176         percpu_u64_set(&c->pcpu->sectors_available, 0);
2177         sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
2178
2179         if (sectors <= sectors_available ||
2180             (flags & BCH_DISK_RESERVATION_NOFAIL)) {
2181                 atomic64_set(&c->sectors_available,
2182                              max_t(s64, 0, sectors_available - sectors));
2183                 this_cpu_add(*c->online_reserved, sectors);
2184                 res->sectors                    += sectors;
2185                 ret = 0;
2186         } else {
2187                 atomic64_set(&c->sectors_available, sectors_available);
2188                 ret = -ENOSPC;
2189         }
2190
2191         mutex_unlock(&c->sectors_available_lock);
2192         percpu_up_read(&c->mark_lock);
2193
2194         return ret;
2195 }
2196
2197 /* Startup/shutdown: */
2198
2199 static void buckets_free_rcu(struct rcu_head *rcu)
2200 {
2201         struct bucket_array *buckets =
2202                 container_of(rcu, struct bucket_array, rcu);
2203
2204         kvpfree(buckets,
2205                 sizeof(struct bucket_array) +
2206                 buckets->nbuckets * sizeof(struct bucket));
2207 }
2208
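/*
 * Allocate (or reallocate, when a device is resized) the in-memory bucket
 * array and the allocator's fifos/heap for @ca. On resize the old contents
 * are copied across under gc_lock/bucket_lock/mark_lock and the old bucket
 * array is freed via RCU.
 */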
2209 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
2210 {
2211         struct bucket_array *buckets = NULL, *old_buckets = NULL;
2212         unsigned long *buckets_nouse = NULL;
2213         alloc_fifo      free[RESERVE_NR];
2214         alloc_fifo      free_inc;
2215         alloc_heap      alloc_heap;
2216
2217         size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
2218                              ca->mi.bucket_size / c->opts.btree_node_size);
2219         /* XXX: these should be tunable */
2220         size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
2221         size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
2222         size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
2223                                       btree_reserve * 2);
2224         bool resize = ca->buckets[0] != NULL;
2225         int ret = -ENOMEM;
2226         unsigned i;
2227
2228         memset(&free,           0, sizeof(free));
2229         memset(&free_inc,       0, sizeof(free_inc));
2230         memset(&alloc_heap,     0, sizeof(alloc_heap));
2231
2232         if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
2233                                             nbuckets * sizeof(struct bucket),
2234                                             GFP_KERNEL|__GFP_ZERO)) ||
2235             !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
2236                                             sizeof(unsigned long),
2237                                             GFP_KERNEL|__GFP_ZERO)) ||
2238             !init_fifo(&free[RESERVE_MOVINGGC],
2239                        copygc_reserve, GFP_KERNEL) ||
2240             !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
2241             !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
2242             !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
2243                 goto err;
2244
2245         buckets->first_bucket   = ca->mi.first_bucket;
2246         buckets->nbuckets       = nbuckets;
2247
2248         bch2_copygc_stop(c);
2249
2250         if (resize) {
2251                 down_write(&c->gc_lock);
2252                 down_write(&ca->bucket_lock);
2253                 percpu_down_write(&c->mark_lock);
2254         }
2255
2256         old_buckets = bucket_array(ca);
2257
2258         if (resize) {
2259                 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
2260
2261                 memcpy(buckets->b,
2262                        old_buckets->b,
2263                        n * sizeof(struct bucket));
2264                 memcpy(buckets_nouse,
2265                        ca->buckets_nouse,
2266                        BITS_TO_LONGS(n) * sizeof(unsigned long));
2267         }
2268
2269         rcu_assign_pointer(ca->buckets[0], buckets);
2270         buckets = old_buckets;
2271
2272         swap(ca->buckets_nouse, buckets_nouse);
2273
2274         if (resize) {
2275                 percpu_up_write(&c->mark_lock);
2276                 up_write(&c->gc_lock);
2277         }
2278
2279         spin_lock(&c->freelist_lock);
2280         for (i = 0; i < RESERVE_NR; i++) {
2281                 fifo_move(&free[i], &ca->free[i]);
2282                 swap(ca->free[i], free[i]);
2283         }
2284         fifo_move(&free_inc, &ca->free_inc);
2285         swap(ca->free_inc, free_inc);
2286         spin_unlock(&c->freelist_lock);
2287
2288         /* with gc lock held, alloc_heap can't be in use: */
2289         swap(ca->alloc_heap, alloc_heap);
2290
2291         nbuckets = ca->mi.nbuckets;
2292
2293         if (resize)
2294                 up_write(&ca->bucket_lock);
2295
2296         ret = 0;
2297 err:
2298         free_heap(&alloc_heap);
2299         free_fifo(&free_inc);
2300         for (i = 0; i < RESERVE_NR; i++)
2301                 free_fifo(&free[i]);
2302         kvpfree(buckets_nouse,
2303                 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
2304         if (buckets)
2305                 call_rcu(&buckets->rcu, buckets_free_rcu);
2306
2307         return ret;
2308 }
2309
2310 void bch2_dev_buckets_free(struct bch_dev *ca)
2311 {
2312         unsigned i;
2313
2314         free_heap(&ca->alloc_heap);
2315         free_fifo(&ca->free_inc);
2316         for (i = 0; i < RESERVE_NR; i++)
2317                 free_fifo(&ca->free[i]);
2318         kvpfree(ca->buckets_nouse,
2319                 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
2320         kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
2321                 sizeof(struct bucket_array) +
2322                 ca->mi.nbuckets * sizeof(struct bucket));
2323
2324         for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
2325                 free_percpu(ca->usage[i]);
2326         kfree(ca->usage_base);
2327 }
2328
2329 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
2330 {
2331         unsigned i;
2332
2333         ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
2334         if (!ca->usage_base)
2335                 return -ENOMEM;
2336
2337         for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
2338                 ca->usage[i] = alloc_percpu(struct bch_dev_usage);
2339                 if (!ca->usage[i])
2340                         return -ENOMEM;
2341         }
2342
2343         return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
2344 }