Update bcachefs sources to 15f6e66e86 bcachefs: pass around bset_tree less
author    Kent Overstreet <kent.overstreet@gmail.com>
Wed, 15 Aug 2018 23:41:24 +0000 (19:41 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Thu, 16 Aug 2018 00:00:21 +0000 (20:00 -0400)
64 files changed:
.bcachefs_revision
cmd_debug.c
cmd_migrate.c
include/linux/kernel.h
include/linux/log2.h
include/linux/sched.h
include/linux/time64.h
include/trace/events/bcachefs.h
libbcachefs/acl.c
libbcachefs/alloc.c
libbcachefs/alloc.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fifo.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs-ioctl.h
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/recovery.c
libbcachefs/str_hash.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/xattr.c
libbcachefs/xattr.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index dddb04437eb6f5c534372f169fbe4c4149bbd346..300e92845481f84f22845bc7f469ae26b531f651 100644
@@ -1 +1 @@
-eab3b355cf6fcabbf07d7a9032c68e95cab37ad0
+15f6e66e86a97245d967fedcb2f33598c174fd96
diff --git a/cmd_debug.c b/cmd_debug.c
index 11d73b35b6eba653242448154317c1198ff71d65..51099f1a6dc5c487b396d51560476897dcd9d361 100644
@@ -204,9 +204,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id,
 
                buf[0] = '\t';
 
-               for_each_btree_node_key_unpack(b, k, &node_iter,
-                                              btree_node_is_extents(b),
-                                              &unpacked) {
+               for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) {
                        bch2_bkey_val_to_text(c, bkey_type(0, btree_id),
                                              buf + 1, sizeof(buf) - 1, k);
                        puts(buf);
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 44283c3cc3f84d5871d7cca00fe577dc32ed3bb9..177884da197b141ab76634fcb2fef9107aab2892 100644
@@ -121,7 +121,7 @@ static void update_inode(struct bch_fs *c,
 
        bch2_inode_pack(&packed, inode);
        ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
-                               NULL, NULL, NULL, 0);
+                               NULL, NULL, 0);
        if (ret)
                die("error creating file: %s", strerror(-ret));
 }
@@ -350,7 +350,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                                        extent_i_to_s_c(e).s_c);
 
                ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
-                                       &res, NULL, NULL, 0);
+                                       &res, NULL, 0);
                if (ret)
                        die("btree insert error %s", strerror(-ret));
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a4c8149eb2f3826d142a8c4b0172609889eb07db..a281edcf40baa6589e5e56c9e0adf4c0be89dc80 100644
 #define swap(a, b) \
        do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/* This counts to 12. Any more than that, and it will return the 13th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
 #define _RET_IP_               (unsigned long)__builtin_return_address(0)
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
 
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 96f6245808cfc5f772c7271a5796001803159bea..2bbe25e4dd45c8fb3126309837fe977cea43dbac 100644
@@ -23,7 +23,7 @@
 /*
  * deal with unrepresentable constant logarithms
  */
-extern __attribute__((const, noreturn))
+extern __attribute__((const))
 int ____ilog2_NaN(void);
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 38a5fecb46d4827b9e87c742c2f60ce0b8704938..f9bb6a4d9de32bd5caa26b9d5a47ba04dca0736b 100644
@@ -146,6 +146,7 @@ static inline struct timespec current_kernel_time(void)
        return ts;
 }
 
+#define current_kernel_time64()        current_kernel_time()
 #define CURRENT_TIME           (current_kernel_time())
 
 #endif /* __TOOLS_LINUX_SCHED_H */
diff --git a/include/linux/time64.h b/include/linux/time64.h
index fd59a9a6e6c793417b0bc090faca03733a3f74bb..cd6cc1c19e63da6fc9103f56a6e9de94bb12d095 100644
@@ -3,6 +3,8 @@
 
 #include <linux/types.h>
 
+#define timespec64  timespec
+
 typedef __s64 time64_t;
 
 /* Parameters used to convert the timespec values: */
@@ -42,4 +44,8 @@ static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
        return t;
 }
 
+#define ns_to_timespec64       ns_to_timespec
+#define timespec64_to_ns       timespec_to_ns
+#define timespec64_trunc       timespec_trunc
+
 #endif /* _LINUX_TIME64_H */
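
These aliases let code written against the kernel's timespec64 API build unchanged in this userspace shim, where struct timespec already carries a 64-bit tv_sec on the platforms the tools target. A hedged sketch of a caller (example_event_ns is hypothetical; it assumes the shim's timespec_to_ns matches the kernel signature, taking a const struct timespec * and returning s64):

    #include <linux/time64.h>

    /* Under the shim, timespec64 is plain timespec and
     * timespec64_to_ns resolves to timespec_to_ns. */
    static s64 example_event_ns(struct timespec64 ts)
    {
            return timespec64_to_ns(&ts);
    }
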
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 13264b82ed77598c66177e7db1c9126b904f64f6..73be8873bc59c6fc8b4043d75efe38a3793a9eb4 100644
@@ -7,7 +7,7 @@
 #include <linux/tracepoint.h>
 
 DECLARE_EVENT_CLASS(bpos,
-       TP_PROTO(struct bpos p),
+       TP_PROTO(struct bpos *p),
        TP_ARGS(p),
 
        TP_STRUCT__entry(
@@ -16,8 +16,8 @@ DECLARE_EVENT_CLASS(bpos,
        ),
 
        TP_fast_assign(
-               __entry->inode  = p.inode;
-               __entry->offset = p.offset;
+               __entry->inode  = p->inode;
+               __entry->offset = p->offset;
        ),
 
        TP_printk("%llu:%llu", __entry->inode, __entry->offset)
@@ -43,21 +43,6 @@ DECLARE_EVENT_CLASS(bkey,
                  __entry->offset, __entry->size)
 );
 
-DECLARE_EVENT_CLASS(bch_dev,
-       TP_PROTO(struct bch_dev *ca),
-       TP_ARGS(ca),
-
-       TP_STRUCT__entry(
-               __array(char,           uuid,   16      )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->uuid, ca->uuid.b, 16);
-       ),
-
-       TP_printk("%pU", __entry->uuid)
-);
-
 DECLARE_EVENT_CLASS(bch_fs,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c),
@@ -138,7 +123,7 @@ DEFINE_EVENT(bio, journal_write,
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
-       TP_PROTO(struct bpos p),
+       TP_PROTO(struct bpos *p),
        TP_ARGS(p)
 );
 
@@ -360,16 +345,6 @@ DEFINE_EVENT(bch_fs, gc_coalesce_end,
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(bch_dev, sectors_saturated,
-       TP_PROTO(struct bch_dev *ca),
-       TP_ARGS(ca)
-);
-
-DEFINE_EVENT(bch_fs, gc_sectors_saturated,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
 DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
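
With the event classes now taking struct bpos *, call sites pass the position by address instead of copying the struct through the tracepoint glue. A hypothetical call site after the change (trace_bkey_pack_pos_fail is the wrapper DEFINE_EVENT generates for bkey_pack_pos_fail):

    static void report_pack_fail(struct bpos pos)
    {
            /* pass by pointer now that TP_PROTO takes struct bpos * */
            trace_bkey_pack_pos_fail(&pos);
    }
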
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 534ea94e545b66350623d3fc1a73d6ef8a68e1c4..5dd666ec42d271ef23e4852bd63730139f251d6f 100644
@@ -284,10 +284,9 @@ static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
                                       void *p)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct timespec now = current_time(&inode->v);
        umode_t mode = (unsigned long) p;
 
-       bi->bi_ctime    = timespec_to_bch2_time(c, now);
+       bi->bi_ctime    = bch2_current_time(c);
        bi->bi_mode     = mode;
        return 0;
 }
@@ -301,13 +300,14 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
        umode_t mode = inode->v.i_mode;
        int ret;
 
+       mutex_lock(&inode->ei_update_lock);
+       bch2_trans_init(&trans, c);
+
        if (type == ACL_TYPE_ACCESS && acl) {
                ret = posix_acl_update_mode(&inode->v, &mode, &acl);
                if (ret)
-                       return ret;
+                       goto err;
        }
-
-       bch2_trans_init(&trans, c);
 retry:
        bch2_trans_begin(&trans);
 
@@ -318,7 +318,7 @@ retry:
                bch2_write_inode_trans(&trans, inode, &inode_u,
                                       inode_update_for_set_acl_fn,
                                       (void *)(unsigned long) mode) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK);
@@ -333,6 +333,7 @@ retry:
        set_cached_acl(&inode->v, type, acl);
 err:
        bch2_trans_exit(&trans);
+       mutex_unlock(&inode->ei_update_lock);
 
        return ret;
 }
@@ -372,7 +373,7 @@ int bch2_acl_chmod(struct btree_trans *trans,
                goto err;
        }
 
-       bch2_trans_update(trans, iter, &new->k_i, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i));
        *new_acl = acl;
        acl = NULL;
 err:
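
bch2_set_acl now takes ei_update_lock before building the transaction and funnels every exit through err, so the early posix_acl_update_mode failure unlocks correctly. The surrounding retry idiom — begin the transaction, queue updates, commit with BTREE_INSERT_ATOMIC, loop on -EINTR — generalizes; a compressed sketch following the shape of the code above (the update step itself is elided):

    /* Sketch of the atomic-commit retry idiom used by bch2_set_acl;
     * names follow the code above, update step elided. */
    static int example_update(struct bch_fs *c, struct bch_inode_info *inode)
    {
            struct btree_trans trans;
            int ret;

            mutex_lock(&inode->ei_update_lock);
            bch2_trans_init(&trans, c);
    retry:
            bch2_trans_begin(&trans);

            /* ... look up iterators and queue updates here ... */

            ret = bch2_trans_commit(&trans, NULL,
                                    &inode->ei_journal_seq,
                                    BTREE_INSERT_ATOMIC|
                                    BTREE_INSERT_NOUNLOCK);
            if (ret == -EINTR)
                    goto retry;

            bch2_trans_exit(&trans);
            mutex_unlock(&inode->ei_update_lock);
            return ret;
    }
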
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index ac2c7d1ff546bc2e378d77e26e419c20e28ff7e8..3f43a1be3a12990c69a3c90b76fe5003c442b765 100644
@@ -154,8 +154,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_alloc_to_text(struct bch_fs *c, char *buf,
-                       size_t size, struct bkey_s_c k)
+int bch2_alloc_to_text(struct bch_fs *c, char *buf,
+                      size_t size, struct bkey_s_c k)
 {
        buf[0] = '\0';
 
@@ -163,6 +163,8 @@ void bch2_alloc_to_text(struct bch_fs *c, char *buf,
        case BCH_ALLOC:
                break;
        }
+
+       return 0;
 }
 
 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
@@ -288,53 +290,41 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
 
 static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
                                  size_t b, struct btree_iter *iter,
-                                 u64 *journal_seq, bool nowait)
+                                 u64 *journal_seq, unsigned flags)
 {
        struct bucket_mark m;
        __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
        struct bucket *g;
        struct bkey_i_alloc *a;
        u8 *d;
-       int ret;
-       unsigned flags = BTREE_INSERT_ATOMIC|
-               BTREE_INSERT_NOFAIL|
-               BTREE_INSERT_USE_RESERVE|
-               BTREE_INSERT_USE_ALLOC_RESERVE;
 
-       if (nowait)
-               flags |= BTREE_INSERT_NOWAIT;
+       percpu_down_read_preempt_disable(&c->usage_lock);
+       g = bucket(ca, b);
+
+       m = READ_ONCE(g->mark);
+       a = bkey_alloc_init(&alloc_key.k);
+       a->k.p          = POS(ca->dev_idx, b);
+       a->v.fields     = 0;
+       a->v.gen        = m.gen;
+       set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
+
+       d = a->v.data;
+       if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
+               put_alloc_field(&d, 2, g->io_time[READ]);
+       if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
+               put_alloc_field(&d, 2, g->io_time[WRITE]);
+       percpu_up_read_preempt_enable(&c->usage_lock);
 
-       bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+       bch2_btree_iter_cond_resched(iter);
 
-       do {
-               ret = btree_iter_err(bch2_btree_iter_peek_slot(iter));
-               if (ret)
-                       break;
+       bch2_btree_iter_set_pos(iter, a->k.p);
 
-               percpu_down_read_preempt_disable(&c->usage_lock);
-               g = bucket(ca, b);
-
-               /* read mark under btree node lock: */
-               m = READ_ONCE(g->mark);
-               a = bkey_alloc_init(&alloc_key.k);
-               a->k.p          = iter->pos;
-               a->v.fields     = 0;
-               a->v.gen        = m.gen;
-               set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v));
-
-               d = a->v.data;
-               if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
-                       put_alloc_field(&d, 2, g->io_time[READ]);
-               if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
-                       put_alloc_field(&d, 2, g->io_time[WRITE]);
-               percpu_up_read_preempt_enable(&c->usage_lock);
-
-               ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, flags,
-                                          BTREE_INSERT_ENTRY(iter, &a->k_i));
-               bch2_btree_iter_cond_resched(iter);
-       } while (ret == -EINTR);
-
-       return ret;
+       return bch2_btree_insert_at(c, NULL, journal_seq,
+                                   BTREE_INSERT_NOFAIL|
+                                   BTREE_INSERT_USE_RESERVE|
+                                   BTREE_INSERT_USE_ALLOC_RESERVE|
+                                   flags,
+                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
 }
 
 int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
@@ -354,8 +344,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
        bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter,
-                                    NULL, false);
+       ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
        bch2_btree_iter_unlock(&iter);
        return ret;
 }
@@ -375,8 +364,8 @@ int bch2_alloc_write(struct bch_fs *c)
 
                down_read(&ca->bucket_lock);
                for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) {
-                       ret = __bch2_alloc_write_key(c, ca, bucket, &iter,
-                                                    NULL, false);
+                       ret = __bch2_alloc_write_key(c, ca, bucket,
+                                                    &iter, NULL, 0);
                        if (ret)
                                break;
 
@@ -582,47 +571,6 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
        return gc_gen < BUCKET_GC_GEN_MAX;
 }
 
-static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                      size_t bucket)
-{
-       struct bucket_mark m;
-
-       percpu_down_read_preempt_disable(&c->usage_lock);
-       spin_lock(&c->freelist_lock);
-
-       if (!bch2_invalidate_bucket(c, ca, bucket, &m)) {
-               spin_unlock(&c->freelist_lock);
-               percpu_up_read_preempt_enable(&c->usage_lock);
-               return;
-       }
-
-       verify_not_on_freelist(c, ca, bucket);
-       BUG_ON(!fifo_push(&ca->free_inc, bucket));
-
-       spin_unlock(&c->freelist_lock);
-       percpu_up_read_preempt_enable(&c->usage_lock);
-
-       /* gc lock held: */
-       bucket_io_clock_reset(c, ca, bucket, READ);
-       bucket_io_clock_reset(c, ca, bucket, WRITE);
-
-       if (m.cached_sectors) {
-               ca->allocator_invalidating_data = true;
-       } else if (m.journal_seq_valid) {
-               u64 journal_seq = atomic64_read(&c->journal.seq);
-               u64 bucket_seq  = journal_seq;
-
-               bucket_seq &= ~((u64) U16_MAX);
-               bucket_seq |= m.journal_seq;
-
-               if (bucket_seq > journal_seq)
-                       bucket_seq -= 1 << 16;
-
-               ca->allocator_journal_seq_flush =
-                       max(ca->allocator_journal_seq_flush, bucket_seq);
-       }
-}
-
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
@@ -674,11 +622,18 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
                (l.bucket > r.bucket) - (l.bucket < r.bucket);
 }
 
+static inline int bucket_idx_cmp(const void *_l, const void *_r)
+{
+       const struct alloc_heap_entry *l = _l, *r = _r;
+
+       return (l->bucket > r->bucket) - (l->bucket < r->bucket);
+}
+
 static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
-       size_t b;
+       size_t b, i, nr = 0;
 
        ca->alloc_heap.used = 0;
 
@@ -720,55 +675,58 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        if (e.nr)
                heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
 
-       up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
-
-       while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
-               for (b = e.bucket;
-                    b < e.bucket + e.nr;
-                    b++) {
-                       if (fifo_full(&ca->free_inc))
-                               return;
+       for (i = 0; i < ca->alloc_heap.used; i++)
+               nr += ca->alloc_heap.data[i].nr;
 
-                       bch2_invalidate_one_bucket(c, ca, b);
-               }
+       while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
+               nr -= ca->alloc_heap.data[0].nr;
+               heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp);
        }
+
+       up_read(&ca->bucket_lock);
+       mutex_unlock(&c->bucket_clock[READ].lock);
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets = bucket_array(ca);
        struct bucket_mark m;
-       size_t b, checked;
+       size_t b, start;
 
-       for (checked = 0;
-            checked < ca->mi.nbuckets && !fifo_full(&ca->free_inc);
-            checked++) {
-               if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
-                   ca->fifo_last_bucket >= ca->mi.nbuckets)
-                       ca->fifo_last_bucket = ca->mi.first_bucket;
+       if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
+           ca->fifo_last_bucket >= ca->mi.nbuckets)
+               ca->fifo_last_bucket = ca->mi.first_bucket;
+
+       start = ca->fifo_last_bucket;
 
-               b = ca->fifo_last_bucket++;
+       do {
+               ca->fifo_last_bucket++;
+               if (ca->fifo_last_bucket == ca->mi.nbuckets)
+                       ca->fifo_last_bucket = ca->mi.first_bucket;
 
+               b = ca->fifo_last_bucket;
                m = READ_ONCE(buckets->b[b].mark);
 
-               if (bch2_can_invalidate_bucket(ca, b, m))
-                       bch2_invalidate_one_bucket(c, ca, b);
+               if (bch2_can_invalidate_bucket(ca, b, m)) {
+                       struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
+
+                       heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
+                       if (heap_full(&ca->alloc_heap))
+                               break;
+               }
 
                cond_resched();
-       }
+       } while (ca->fifo_last_bucket != start);
 }
 
 static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets = bucket_array(ca);
        struct bucket_mark m;
-       size_t checked;
+       size_t checked, i;
 
        for (checked = 0;
-            checked < ca->mi.nbuckets / 2 && !fifo_full(&ca->free_inc);
+            checked < ca->mi.nbuckets / 2;
             checked++) {
                size_t b = bch2_rand_range(ca->mi.nbuckets -
                                           ca->mi.first_bucket) +
@@ -776,17 +734,34 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
 
                m = READ_ONCE(buckets->b[b].mark);
 
-               if (bch2_can_invalidate_bucket(ca, b, m))
-                       bch2_invalidate_one_bucket(c, ca, b);
+               if (bch2_can_invalidate_bucket(ca, b, m)) {
+                       struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
+
+                       heap_add(&ca->alloc_heap, e, bucket_alloc_cmp);
+                       if (heap_full(&ca->alloc_heap))
+                               break;
+               }
 
                cond_resched();
        }
+
+       sort(ca->alloc_heap.data,
+            ca->alloc_heap.used,
+            sizeof(ca->alloc_heap.data[0]),
+            bucket_idx_cmp, NULL);
+
+       /* remove duplicates: */
+       for (i = 0; i + 1 < ca->alloc_heap.used; i++)
+               if (ca->alloc_heap.data[i].bucket ==
+                   ca->alloc_heap.data[i + 1].bucket)
+                       ca->alloc_heap.data[i].nr = 0;
 }
 
-static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 {
+       size_t i, nr = 0;
+
        ca->inc_gen_needs_gc                    = 0;
-       ca->inc_gen_really_needs_gc             = 0;
 
        switch (ca->mi.replacement) {
        case CACHE_REPLACEMENT_LRU:
@@ -799,86 +774,132 @@ static void find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
                find_reclaimable_buckets_random(c, ca);
                break;
        }
+
+       heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
+
+       for (i = 0; i < ca->alloc_heap.used; i++)
+               nr += ca->alloc_heap.data[i].nr;
+
+       return nr;
 }
 
-static int size_t_cmp(const void *_l, const void *_r)
+static inline long next_alloc_bucket(struct bch_dev *ca)
 {
-       const size_t *l = _l, *r = _r;
+       struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+
+       while (ca->alloc_heap.used) {
+               if (top->nr) {
+                       size_t b = top->bucket;
+
+                       top->bucket++;
+                       top->nr--;
+                       return b;
+               }
+
+               heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp);
+       }
 
-       return (*l > *r) - (*l < *r);
+       return -1;
 }
 
-static void sort_free_inc(struct bch_fs *c, struct bch_dev *ca)
+static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                      size_t bucket, u64 *flush_seq)
 {
-       BUG_ON(ca->free_inc.front);
+       struct bucket_mark m;
 
+       percpu_down_read_preempt_disable(&c->usage_lock);
        spin_lock(&c->freelist_lock);
-       sort(ca->free_inc.data,
-            ca->free_inc.back,
-            sizeof(ca->free_inc.data[0]),
-            size_t_cmp, NULL);
+
+       bch2_invalidate_bucket(c, ca, bucket, &m);
+
+       verify_not_on_freelist(c, ca, bucket);
+       BUG_ON(!fifo_push(&ca->free_inc, bucket));
+
        spin_unlock(&c->freelist_lock);
+
+       bucket_io_clock_reset(c, ca, bucket, READ);
+       bucket_io_clock_reset(c, ca, bucket, WRITE);
+
+       percpu_up_read_preempt_enable(&c->usage_lock);
+
+       if (m.journal_seq_valid) {
+               u64 journal_seq = atomic64_read(&c->journal.seq);
+               u64 bucket_seq  = journal_seq;
+
+               bucket_seq &= ~((u64) U16_MAX);
+               bucket_seq |= m.journal_seq;
+
+               if (bucket_seq > journal_seq)
+                       bucket_seq -= 1 << 16;
+
+               *flush_seq = max(*flush_seq, bucket_seq);
+       }
+
+       return m.cached_sectors != 0;
 }
 
-static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca,
-                                   u64 *journal_seq, size_t nr,
-                                   bool nowait)
+/*
+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
+ */
+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
 {
        struct btree_iter iter;
+       u64 journal_seq = 0;
        int ret = 0;
+       long b;
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        /* Only use nowait if we've already invalidated at least one bucket: */
-       while (ca->nr_invalidated < min(nr, fifo_used(&ca->free_inc))) {
-               size_t b = fifo_idx_entry(&ca->free_inc, ca->nr_invalidated);
-
-               ret = __bch2_alloc_write_key(c, ca, b, &iter, journal_seq,
-                                            nowait && ca->nr_invalidated);
-               if (ret)
-                       break;
-
-               ca->nr_invalidated++;
+       while (!ret &&
+              !fifo_full(&ca->free_inc) &&
+              (b = next_alloc_bucket(ca)) >= 0) {
+               bool must_flush =
+                       bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
+
+               ret = __bch2_alloc_write_key(c, ca, b, &iter,
+                               must_flush ? &journal_seq : NULL,
+                               !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
        }
 
        bch2_btree_iter_unlock(&iter);
 
        /* If we used NOWAIT, don't return the error: */
-       return ca->nr_invalidated ? 0 : ret;
-}
-
-static bool __push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
-{
-       unsigned i;
+       if (!fifo_empty(&ca->free_inc))
+               ret = 0;
+       if (ret) {
+               bch_err(ca, "error invalidating buckets: %i", ret);
+               return ret;
+       }
 
-       /*
-        * Don't remove from free_inc until after it's added to
-        * freelist, so gc can find it:
-        */
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++)
-               if (fifo_push(&ca->free[i], bucket)) {
-                       fifo_pop(&ca->free_inc, bucket);
-                       --ca->nr_invalidated;
-                       closure_wake_up(&c->freelist_wait);
-                       spin_unlock(&c->freelist_lock);
-                       return true;
-               }
-       spin_unlock(&c->freelist_lock);
+       if (journal_seq)
+               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
+       if (ret) {
+               bch_err(ca, "journal error: %i", ret);
+               return ret;
+       }
 
-       return false;
+       return 0;
 }
 
 static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket)
 {
+       unsigned i;
        int ret = 0;
 
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
 
-               if (__push_invalidated_bucket(c, ca, bucket))
-                       break;
+               spin_lock(&c->freelist_lock);
+               for (i = 0; i < RESERVE_NR; i++)
+                       if (fifo_push(&ca->free[i], bucket)) {
+                               fifo_pop(&ca->free_inc, bucket);
+                               closure_wake_up(&c->freelist_wait);
+                               spin_unlock(&c->freelist_lock);
+                               goto out;
+                       }
+               spin_unlock(&c->freelist_lock);
 
                if ((current->flags & PF_KTHREAD) &&
                    kthread_should_stop()) {
@@ -889,22 +910,20 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
                schedule();
                try_to_freeze();
        }
-
+out:
        __set_current_state(TASK_RUNNING);
        return ret;
 }
 
 /*
- * Given an invalidated, ready to use bucket: issue a discard to it if enabled,
- * then add it to the freelist, waiting until there's room if necessary:
+ * Pulls buckets off free_inc, discards them (if enabled), then adds them to
+ * freelists, waiting until there's room if necessary:
  */
 static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
 {
-       while (ca->nr_invalidated) {
+       while (!fifo_empty(&ca->free_inc)) {
                size_t bucket = fifo_peek(&ca->free_inc);
 
-               BUG_ON(fifo_empty(&ca->free_inc) || !ca->nr_invalidated);
-
                if (ca->mi.discard &&
                    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                        blkdev_issue_discard(ca->disk_sb.bdev,
@@ -930,68 +949,37 @@ static int bch2_allocator_thread(void *arg)
 {
        struct bch_dev *ca = arg;
        struct bch_fs *c = ca->fs;
-       u64 journal_seq;
+       size_t nr;
        int ret;
 
        set_freezable();
 
        while (1) {
-               while (1) {
-                       cond_resched();
-
-                       pr_debug("discarding %zu invalidated buckets",
-                                ca->nr_invalidated);
-
-                       ret = discard_invalidated_buckets(c, ca);
-                       if (ret)
-                               goto stop;
-
-                       if (fifo_empty(&ca->free_inc))
-                               break;
+               cond_resched();
 
-                       pr_debug("invalidating %zu buckets",
-                                fifo_used(&ca->free_inc));
+               pr_debug("discarding %zu invalidated buckets",
+                        fifo_used(&ca->free_inc));
 
-                       journal_seq = 0;
-                       ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
-                                                      SIZE_MAX, true);
-                       if (ret) {
-                               bch_err(ca, "error invalidating buckets: %i", ret);
-                               goto stop;
-                       }
+               ret = discard_invalidated_buckets(c, ca);
+               if (ret)
+                       goto stop;
 
-                       if (!ca->nr_invalidated) {
-                               bch_err(ca, "allocator thread unable to make forward progress!");
-                               goto stop;
-                       }
+               down_read(&c->gc_lock);
 
-                       if (ca->allocator_invalidating_data)
-                               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
-                       else if (ca->allocator_journal_seq_flush)
-                               ret = bch2_journal_flush_seq(&c->journal,
-                                                      ca->allocator_journal_seq_flush);
+               ret = bch2_invalidate_buckets(c, ca);
+               if (ret) {
+                       up_read(&c->gc_lock);
+                       goto stop;
+               }
 
-                       /*
-                        * journal error - buckets haven't actually been
-                        * invalidated, can't discard them:
-                        */
-                       if (ret) {
-                               bch_err(ca, "journal error: %i", ret);
-                               goto stop;
-                       }
+               if (!fifo_empty(&ca->free_inc)) {
+                       up_read(&c->gc_lock);
+                       continue;
                }
 
                pr_debug("free_inc now empty");
 
-               /* Reset front/back so we can easily sort fifo entries later: */
-               ca->free_inc.front = ca->free_inc.back  = 0;
-               ca->allocator_journal_seq_flush         = 0;
-               ca->allocator_invalidating_data         = false;
-
-               down_read(&c->gc_lock);
-               while (1) {
-                       size_t prev = fifo_used(&ca->free_inc);
-
+               do {
                        if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
                                up_read(&c->gc_lock);
                                bch_err(ca, "gc failure");
@@ -1007,56 +995,46 @@ static int bch2_allocator_thread(void *arg)
 
                        pr_debug("scanning for reclaimable buckets");
 
-                       find_reclaimable_buckets(c, ca);
+                       nr = find_reclaimable_buckets(c, ca);
 
-                       pr_debug("found %zu buckets (free_inc %zu/%zu)",
-                                fifo_used(&ca->free_inc) - prev,
-                                fifo_used(&ca->free_inc), ca->free_inc.size);
+                       pr_debug("found %zu buckets", nr);
 
-                       trace_alloc_batch(ca, fifo_used(&ca->free_inc),
-                                         ca->free_inc.size);
+                       trace_alloc_batch(ca, nr, ca->alloc_heap.size);
 
-                       if ((ca->inc_gen_needs_gc >= ca->free_inc.size ||
-                            (!fifo_full(&ca->free_inc) &&
-                             ca->inc_gen_really_needs_gc >=
-                             fifo_free(&ca->free_inc))) &&
+                       if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
+                            ca->inc_gen_really_needs_gc) &&
                            c->gc_thread) {
                                atomic_inc(&c->kick_gc);
                                wake_up_process(c->gc_thread);
                        }
 
-                       if (fifo_full(&ca->free_inc))
-                               break;
-
-                       if (!fifo_empty(&ca->free_inc) &&
-                           !fifo_full(&ca->free[RESERVE_MOVINGGC]))
-                               break;
-
                        /*
-                        * copygc may be waiting until either its reserve fills
-                        * up, or we can't make forward progress:
+                        * If we found any buckets, we have to invalidate them
+                        * before we scan for more - but if we didn't find very
+                        * many we may want to wait on more buckets being
+                        * available so we don't spin:
                         */
-                       ca->allocator_blocked = true;
-                       closure_wake_up(&c->freelist_wait);
-
-                       ret = wait_buckets_available(c, ca);
-                       if (ret) {
-                               up_read(&c->gc_lock);
-                               goto stop;
+                       if (!nr ||
+                           (nr < ALLOC_SCAN_BATCH(ca) &&
+                            !fifo_full(&ca->free[RESERVE_MOVINGGC]))) {
+                               ca->allocator_blocked = true;
+                               closure_wake_up(&c->freelist_wait);
+
+                               ret = wait_buckets_available(c, ca);
+                               if (ret) {
+                                       up_read(&c->gc_lock);
+                                       goto stop;
+                               }
                        }
-               }
+               } while (!nr);
 
                ca->allocator_blocked = false;
                up_read(&c->gc_lock);
 
-               pr_debug("free_inc now %zu/%zu",
-                        fifo_used(&ca->free_inc),
-                        ca->free_inc.size);
-
-               sort_free_inc(c, ca);
+               pr_debug("%zu buckets to invalidate", nr);
 
                /*
-                * free_inc is now full of newly-invalidated buckets: next,
+                * alloc_heap is now full of newly-invalidated buckets: next,
                 * write out the new bucket gens:
                 */
        }
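
Stripped of locking, error handling and the blocked/wakeup bookkeeping, the allocator thread's rewritten main loop above reduces to roughly the following (a paraphrase for orientation, not a drop-in):

    size_t nr;

    while (1) {
            /* free_inc -> per-reserve freelists, issuing discards */
            discard_invalidated_buckets(c, ca);

            /* alloc_heap -> free_inc, writing new alloc keys */
            bch2_invalidate_buckets(c, ca);
            if (!fifo_empty(&ca->free_inc))
                    continue;

            /* refill alloc_heap, blocking if too few buckets found */
            do {
                    nr = find_reclaimable_buckets(c, ca);
                    if (!nr || (nr < ALLOC_SCAN_BATCH(ca) &&
                                !fifo_full(&ca->free[RESERVE_MOVINGGC])))
                            wait_buckets_available(c, ca);
            } while (!nr);
    }
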
@@ -1733,7 +1711,7 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 void bch2_recalc_capacity(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       u64 total_capacity, capacity = 0, reserved_sectors = 0;
+       u64 capacity = 0, reserved_sectors = 0, gc_reserve;
        unsigned long ra_pages = 0;
        unsigned i, j;
 
@@ -1748,7 +1726,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
        bch2_set_ra_pages(c, ra_pages);
 
        for_each_rw_member(ca, c, i) {
-               size_t reserve = 0;
+               u64 dev_reserve = 0;
 
                /*
                 * We need to reserve buckets (from the number
@@ -1767,30 +1745,36 @@ void bch2_recalc_capacity(struct bch_fs *c)
                 * not -ENOSPC calculations.
                 */
                for (j = 0; j < RESERVE_NONE; j++)
-                       reserve += ca->free[j].size;
+                       dev_reserve += ca->free[j].size;
+
+               dev_reserve += ca->free_inc.size;
 
-               reserve += ca->free_inc.size;
+               dev_reserve += ARRAY_SIZE(c->write_points);
 
-               reserve += ARRAY_SIZE(c->write_points);
+               dev_reserve += 1;       /* btree write point */
+               dev_reserve += 1;       /* copygc write point */
+               dev_reserve += 1;       /* rebalance write point */
+               dev_reserve += WRITE_POINT_COUNT;
 
-               reserve += 1;   /* btree write point */
+               dev_reserve *= ca->mi.bucket_size;
 
-               reserved_sectors += bucket_to_sector(ca, reserve);
+               ca->copygc_threshold = dev_reserve;
 
                capacity += bucket_to_sector(ca, ca->mi.nbuckets -
                                             ca->mi.first_bucket);
-       }
 
-       total_capacity = capacity;
+               reserved_sectors += dev_reserve * 2;
+       }
 
-       capacity *= (100 - c->opts.gc_reserve_percent);
-       capacity = div64_u64(capacity, 100);
+       gc_reserve = c->opts.gc_reserve_bytes
+               ? c->opts.gc_reserve_bytes >> 9
+               : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
 
-       BUG_ON(reserved_sectors > total_capacity);
+       reserved_sectors = max(gc_reserve, reserved_sectors);
 
-       capacity = min(capacity, total_capacity - reserved_sectors);
+       reserved_sectors = min(reserved_sectors, capacity);
 
-       c->capacity = capacity;
+       c->capacity = capacity - reserved_sectors;
 
        if (c->capacity) {
                bch2_io_timer_add(&c->io_clock[READ],
@@ -1946,39 +1930,83 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
        return 0;
 }
 
+static void flush_held_btree_writes(struct bch_fs *c)
+{
+       struct bucket_table *tbl;
+       struct rhash_head *pos;
+       struct btree *b;
+       bool flush_updates;
+       size_t i, nr_pending_updates;
+
+       clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+again:
+       pr_debug("flushing dirty btree nodes");
+       cond_resched();
+
+       flush_updates = false;
+       nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+
+       rcu_read_lock();
+       for_each_cached_btree(b, c, tbl, i, pos)
+               if (btree_node_dirty(b) && (!b->written || b->level)) {
+                       if (btree_node_may_write(b)) {
+                               rcu_read_unlock();
+                               btree_node_lock_type(c, b, SIX_LOCK_read);
+                               bch2_btree_node_write(c, b, SIX_LOCK_read);
+                               six_unlock_read(&b->lock);
+                               goto again;
+                       } else {
+                               flush_updates = true;
+                       }
+               }
+       rcu_read_unlock();
+
+       if (c->btree_roots_dirty)
+               bch2_journal_meta(&c->journal);
+
+       /*
+        * This is ugly, but it's needed to flush btree node writes
+        * without spinning...
+        */
+       if (flush_updates) {
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  bch2_btree_interior_updates_nr_pending(c) <
+                                  nr_pending_updates);
+               goto again;
+       }
+
+}
+
 static void allocator_start_issue_discards(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned dev_iter;
-       size_t i, bu;
-
-       for_each_rw_member(ca, c, dev_iter) {
-               unsigned done = 0;
-
-               fifo_for_each_entry(bu, &ca->free_inc, i) {
-                       if (done == ca->nr_invalidated)
-                               break;
+       size_t bu;
 
+       for_each_rw_member(ca, c, dev_iter)
+               while (fifo_pop(&ca->free_inc, bu))
                        blkdev_issue_discard(ca->disk_sb.bdev,
                                             bucket_to_sector(ca, bu),
                                             ca->mi.bucket_size, GFP_NOIO, 0);
-                       done++;
-               }
-       }
 }
 
 static int __bch2_fs_allocator_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       size_t bu, i;
        unsigned dev_iter;
        u64 journal_seq = 0;
+       long bu;
        bool invalidating_data = false;
        int ret = 0;
 
        if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
                return -1;
 
+       if (test_alloc_startup(c)) {
+               invalidating_data = true;
+               goto not_enough;
+       }
+
        /* Scan for buckets that are already invalidated: */
        for_each_rw_member(ca, c, dev_iter) {
                struct btree_iter iter;
@@ -2003,7 +2031,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
                        percpu_up_read_preempt_enable(&c->usage_lock);
 
                        fifo_push(&ca->free_inc, bu);
-                       ca->nr_invalidated++;
 
                        if (fifo_full(&ca->free_inc))
                                break;
@@ -2022,24 +2049,23 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 not_enough:
        pr_debug("did not find enough empty buckets; issuing discards");
 
-       /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
+       /* clear out free_inc, we'll be using it again below: */
        for_each_rw_member(ca, c, dev_iter)
                discard_invalidated_buckets(c, ca);
 
        pr_debug("scanning for reclaimable buckets");
 
        for_each_rw_member(ca, c, dev_iter) {
-               BUG_ON(!fifo_empty(&ca->free_inc));
-               ca->free_inc.front = ca->free_inc.back  = 0;
-
                find_reclaimable_buckets(c, ca);
-               sort_free_inc(c, ca);
 
-               invalidating_data |= ca->allocator_invalidating_data;
+               while (!fifo_full(&ca->free[RESERVE_BTREE]) &&
+                      (bu = next_alloc_bucket(ca)) >= 0) {
+                       invalidating_data |=
+                               bch2_invalidate_one_bucket(c, ca, bu, &journal_seq);
 
-               fifo_for_each_entry(bu, &ca->free_inc, i)
-                       if (!fifo_push(&ca->free[RESERVE_BTREE], bu))
-                               break;
+                       fifo_push(&ca->free[RESERVE_BTREE], bu);
+                       set_bit(bu, ca->buckets_dirty);
+               }
        }
 
        pr_debug("done scanning for reclaimable buckets");
@@ -2054,6 +2080,8 @@ not_enough:
         * invalidated on disk:
         */
        if (invalidating_data) {
+               BUG();
+               pr_info("holding writes");
                pr_debug("invalidating existing data");
                set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
        } else {
@@ -2065,16 +2093,9 @@ not_enough:
         * XXX: it's possible for this to deadlock waiting on journal reclaim,
         * since we're holding btree writes. What then?
         */
-
-       for_each_rw_member(ca, c, dev_iter) {
-               ret = bch2_invalidate_free_inc(c, ca, &journal_seq,
-                                              ca->free[RESERVE_BTREE].size,
-                                              false);
-               if (ret) {
-                       percpu_ref_put(&ca->io_ref);
-                       return ret;
-               }
-       }
+       ret = bch2_alloc_write(c);
+       if (ret)
+               return ret;
 
        if (invalidating_data) {
                pr_debug("flushing journal");
@@ -2087,57 +2108,11 @@ not_enough:
                allocator_start_issue_discards(c);
        }
 
-       for_each_rw_member(ca, c, dev_iter)
-               while (ca->nr_invalidated) {
-                       BUG_ON(!fifo_pop(&ca->free_inc, bu));
-                       ca->nr_invalidated--;
-               }
-
        set_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags);
 
        /* now flush dirty btree nodes: */
-       if (invalidating_data) {
-               struct bucket_table *tbl;
-               struct rhash_head *pos;
-               struct btree *b;
-               bool flush_updates;
-               size_t nr_pending_updates;
-
-               clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
-again:
-               pr_debug("flushing dirty btree nodes");
-               cond_resched();
-
-               flush_updates = false;
-               nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
-
-
-               rcu_read_lock();
-               for_each_cached_btree(b, c, tbl, i, pos)
-                       if (btree_node_dirty(b) && (!b->written || b->level)) {
-                               if (btree_node_may_write(b)) {
-                                       rcu_read_unlock();
-                                       btree_node_lock_type(c, b, SIX_LOCK_read);
-                                       bch2_btree_node_write(c, b, SIX_LOCK_read);
-                                       six_unlock_read(&b->lock);
-                                       goto again;
-                               } else {
-                                       flush_updates = true;
-                               }
-                       }
-               rcu_read_unlock();
-
-               /*
-                * This is ugly, but it's needed to flush btree node writes
-                * without spinning...
-                */
-               if (flush_updates) {
-                       closure_wait_event(&c->btree_interior_update_wait,
-                               bch2_btree_interior_updates_nr_pending(c) <
-                               nr_pending_updates);
-                       goto again;
-               }
-       }
+       if (invalidating_data)
+               flush_held_btree_writes(c);
 
        return 0;
 }
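
Two sizing notes on the constants in play above: ALLOC_SCAN_BATCH(ca) is nbuckets >> 9, so a device with 2^20 buckets scans in batches of 2048; and when the new gc_reserve_bytes option is set, gc_reserve is simply that byte count shifted into 512-byte sectors (1 GiB becomes 2^21 sectors), falling back to gc_reserve_percent of capacity otherwise.
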
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 00d01f464c68f3a95c30883f076cb82eeede8aa5..739df233236c33c8330d36ede894bd9602be16f4 100644
@@ -9,8 +9,10 @@ struct bch_dev;
 struct bch_fs;
 struct bch_devs_list;
 
+#define ALLOC_SCAN_BATCH(ca)           ((ca)->mi.nbuckets >> 9)
+
 const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 
 #define bch2_bkey_alloc_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_alloc_invalid,           \
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index bd5ea6fc59d7d8bf0180adc25f2d92f86d57e743..92727cca2d756eb9f53b2a741980bb7d78919262 100644
@@ -267,6 +267,10 @@ do {                                                                       \
                "Store the journal sequence number in the version "     \
                "number of every btree key, and verify that btree "     \
                "update ordering is preserved during recovery")         \
+       BCH_DEBUG_PARAM(test_alloc_startup,                             \
+               "Force allocator startup to use the slowpath where it " \
+               "can't find enough free buckets without invalidating "  \
+               "cached data")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -400,7 +404,6 @@ struct bch_dev {
        alloc_fifo              free[RESERVE_NR];
        alloc_fifo              free_inc;
        spinlock_t              freelist_lock;
-       size_t                  nr_invalidated;
 
        u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
        unsigned                open_buckets_partial_nr;
@@ -410,11 +413,8 @@ struct bch_dev {
        /* last calculated minimum prio */
        u16                     max_last_bucket_io[2];
 
-       atomic_long_t           saturated_count;
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
-       u64                     allocator_journal_seq_flush;
-       bool                    allocator_invalidating_data;
        bool                    allocator_blocked;
 
        alloc_heap              alloc_heap;
@@ -424,6 +424,7 @@ struct bch_dev {
        copygc_heap             copygc_heap;
        struct bch_pd_controller copygc_pd;
        struct write_point      copygc_write_point;
+       u64                     copygc_threshold;
 
        atomic64_t              rebalance_work;
 
@@ -576,6 +577,8 @@ struct bch_fs {
        struct mutex            btree_interior_update_lock;
        struct closure_waitlist btree_interior_update_wait;
 
+       mempool_t               btree_iters_pool;
+
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
@@ -716,7 +719,7 @@ struct bch_fs {
 
        struct journal          journal;
 
-       unsigned                bucket_journal_seq;
+       u64                     last_bucket_seq_cleanup;
 
        /* The rest of this all shows up in sysfs */
        atomic_long_t           read_realloc_races;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index e300738d6c61b6eb21bdbe86d6807035d98b0112..f1814f4caf2114cc5b3583179469b9f42df2b926 100644
@@ -1214,6 +1214,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,    struct bch_sb, flags[1], 52, 64);
 
 LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
                                        struct bch_sb, flags[2],  0,  4);
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,  struct bch_sb, flags[2],  4, 64);
 
 /* Features: */
 enum bch_sb_features {
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index e4f62f905f114b6e460428d1b8e151f8c8c2d33a..bbe9af67bb2dfc61c84c9aae6d5ebb863e857bfa 100644
@@ -122,16 +122,27 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 
 #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
 
+int bch2_bpos_to_text(char *buf, size_t size, struct bpos pos)
+{
+       char *out = buf, *end = buf + size;
+
+       if (!bkey_cmp(pos, POS_MIN))
+               p("POS_MIN");
+       else if (!bkey_cmp(pos, POS_MAX))
+               p("POS_MAX");
+       else
+               p("%llu:%llu", pos.inode, pos.offset);
+
+       return out - buf;
+}
+
 int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 {
        char *out = buf, *end = buf + size;
 
        p("u64s %u type %u ", k->u64s, k->type);
 
-       if (bkey_cmp(k->p, POS_MAX))
-               p("%llu:%llu", k->p.inode, k->p.offset);
-       else
-               p("POS_MAX");
+       out += bch2_bpos_to_text(out, end - out, k->p);
 
        p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
 
@@ -159,7 +170,7 @@ int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
                break;
        default:
                if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
-                       ops->val_to_text(c, buf, size, k);
+                       out += ops->val_to_text(c, out, end - out, k);
                break;
        }
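
val_to_text implementations now return the number of bytes written, so callers can chain appends with out += ..., mirroring the p()/scnprintf pattern above. A sketch of an implementation honoring the new contract (hypothetical key type; assumes only scnprintf):

    static int example_val_to_text(struct bch_fs *c, char *buf,
                                   size_t size, struct bkey_s_c k)
    {
            char *out = buf, *end = buf + size;

            out += scnprintf(out, end - out, "example value");
            return out - buf;
    }
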
 
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 9e2c90d54e4242681ef499c5b9dbc35646b8ac8a..c708f8c09cabad1719ab7570ae7bbe8ffeed7b5a 100644
@@ -56,7 +56,7 @@ struct bkey_ops {
                                       struct bkey_s_c);
        void            (*key_debugcheck)(struct bch_fs *, struct btree *,
                                          struct bkey_s_c);
-       void            (*val_to_text)(struct bch_fs *, char *,
+       int             (*val_to_text)(struct bch_fs *, char *,
                                       size_t, struct bkey_s_c);
        void            (*swab)(const struct bkey_format *, struct bkey_packed *);
        key_filter_fn   key_normalize;
@@ -72,6 +72,7 @@ const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
 
+int bch2_bpos_to_text(char *, size_t, struct bpos);
 int bch2_bkey_to_text(char *, size_t, const struct bkey *);
 int bch2_val_to_text(struct bch_fs *, enum bkey_type,
                     char *, size_t, struct bkey_s_c);
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 8c77fc509b55fdef2f005b254ab9d87ae1a5f26b..fdd624a1ae4ece24f34bdb765a4204c95222dd45 100644
 #include "alloc_types.h"
 #include <trace/events/bcachefs.h>
 
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
+                                                 struct btree *);
+
 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
 {
+       unsigned offset = __btree_node_key_to_offset(b, k);
        struct bset_tree *t;
 
        for_each_bset(b, t)
-               if (k >= btree_bkey_first(b, t) &&
-                   k < btree_bkey_last(b, t))
+               if (offset <= t->end_offset) {
+                       EBUG_ON(offset < btree_bkey_first_offset(t));
                        return t;
+               }
 
        BUG();
 }
@@ -64,8 +69,8 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
                _n = bkey_next(_k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &k);
-               printk(KERN_ERR "block %u key %zi/%u: %s\n", set,
-                      _k->_data - i->_data, i->u64s, buf);
+               printk(KERN_ERR "block %u key %5u: %s\n", set,
+                      __btree_node_key_to_offset(b, _k), buf);
 
                if (_n == vstruct_last(i))
                        continue;
@@ -121,20 +126,6 @@ void bch2_dump_btree_node_iter(struct btree *b,
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 
-static bool keys_out_of_order(struct btree *b,
-                             const struct bkey_packed *prev,
-                             const struct bkey_packed *next,
-                             bool is_extents)
-{
-       struct bkey nextu = bkey_unpack_key(b, next);
-
-       return bkey_cmp_left_packed_byval(b, prev, bkey_start_pos(&nextu)) > 0 ||
-               ((is_extents
-                 ? !bkey_deleted(next)
-                 : !bkey_deleted(prev)) &&
-                !bkey_cmp_packed(b, prev, next));
-}
-
 void __bch2_verify_btree_nr_keys(struct btree *b)
 {
        struct bset_tree *t;
@@ -151,16 +142,21 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
        BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
 }
 
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
-                                          struct btree *b,
-                                          struct bkey_packed *k)
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+                                           struct btree *b)
 {
-       const struct bkey_packed *n = bch2_btree_node_iter_peek_all(iter, b);
+       struct btree_node_iter iter = *_iter;
+       const struct bkey_packed *k, *n;
+
+       k = bch2_btree_node_iter_peek_all(&iter, b);
+       __bch2_btree_node_iter_advance(&iter, b);
+       n = bch2_btree_node_iter_peek_all(&iter, b);
 
        bkey_unpack_key(b, k);
 
        if (n &&
-           keys_out_of_order(b, k, n, iter->is_extents)) {
+           __btree_node_iter_cmp(b, k, n) > 0) {
+               struct btree_node_iter_set *set;
                struct bkey ku = bkey_unpack_key(b, k);
                struct bkey nu = bkey_unpack_key(b, n);
                char buf1[80], buf2[80];
@@ -168,106 +164,104 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
                bch2_dump_btree_node(b);
                bch2_bkey_to_text(buf1, sizeof(buf1), &ku);
                bch2_bkey_to_text(buf2, sizeof(buf2), &nu);
-               panic("out of order/overlapping:\n%s\n%s\n", buf1, buf2);
+               printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
+                      buf1, buf2);
+               printk(KERN_ERR "iter was:");
+
+               btree_node_iter_for_each(_iter, set) {
+                       struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
+                       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+                       printk(" [%zi %zi]", t - b->set,
+                              k->_data - bset(b, t)->_data);
+               }
+               panic("\n");
        }
 }
 
 void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
-                               struct btree *b)
+                                struct btree *b)
 {
-       struct btree_node_iter_set *set, *prev = NULL;
+       struct btree_node_iter_set *set, *s2;
        struct bset_tree *t;
-       struct bkey_packed *k, *first;
 
-       if (bch2_btree_node_iter_end(iter))
-               return;
+       /* Verify no duplicates: */
+       btree_node_iter_for_each(iter, set)
+               btree_node_iter_for_each(iter, s2)
+                       BUG_ON(set != s2 && set->end == s2->end);
 
+       /* Verify that set->end is correct: */
        btree_node_iter_for_each(iter, set) {
-               k = __btree_node_offset_to_key(b, set->k);
-               t = bch2_bkey_to_bset(b, k);
-
-               BUG_ON(__btree_node_offset_to_key(b, set->end) !=
-                      btree_bkey_last(b, t));
-
-               BUG_ON(prev &&
-                      btree_node_iter_cmp(iter, b, *prev, *set) > 0);
-
-               prev = set;
+               for_each_bset(b, t)
+                       if (set->end == t->end_offset)
+                               goto found;
+               BUG();
+found:
+               BUG_ON(set->k < btree_bkey_first_offset(t) ||
+                      set->k >= t->end_offset);
        }
 
-       first = __btree_node_offset_to_key(b, iter->data[0].k);
-
-       for_each_bset(b, t)
-               if (bch2_btree_node_iter_bset_pos(iter, b, t) ==
-                   btree_bkey_last(b, t) &&
-                   (k = bch2_bkey_prev_all(b, t, btree_bkey_last(b, t))))
-                       BUG_ON(__btree_node_iter_cmp(iter->is_extents, b,
-                                                    k, first) > 0);
+       /* Verify iterator is sorted: */
+       btree_node_iter_for_each(iter, set)
+               BUG_ON(set != iter->data &&
+                      btree_node_iter_cmp(b, set[-1], set[0]) > 0);
 }
 
-void bch2_verify_key_order(struct btree *b,
-                         struct btree_node_iter *iter,
-                         struct bkey_packed *where)
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+                           struct bkey_packed *insert, unsigned clobber_u64s)
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
-       struct bkey_packed *k, *prev;
-       struct bkey uk, uw = bkey_unpack_key(b, where);
-
-       k = bch2_bkey_prev_all(b, t, where);
-       if (k &&
-           keys_out_of_order(b, k, where, iter->is_extents)) {
-               char buf1[100], buf2[100];
+       struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
+       struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+#if 0
+       BUG_ON(prev &&
+              __btree_node_iter_cmp(b, prev, insert) > 0);
+#else
+       if (prev &&
+           __btree_node_iter_cmp(b, prev, insert) > 0) {
+               struct bkey k1 = bkey_unpack_key(b, prev);
+               struct bkey k2 = bkey_unpack_key(b, insert);
+               char buf1[100];
+               char buf2[100];
 
                bch2_dump_btree_node(b);
-               uk = bkey_unpack_key(b, k);
-               bch2_bkey_to_text(buf1, sizeof(buf1), &uk);
-               bch2_bkey_to_text(buf2, sizeof(buf2), &uw);
-               panic("out of order with prev:\n%s\n%s\n",
-                     buf1, buf2);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &k1);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &k2);
+
+               panic("prev > insert:\n"
+                     "prev    key %5u %s\n"
+                     "insert  key %5u %s\n",
+                      __btree_node_key_to_offset(b, prev), buf1,
+                      __btree_node_key_to_offset(b, insert), buf2);
        }
+#endif
+#if 0
+       BUG_ON(next != btree_bkey_last(b, t) &&
+              __btree_node_iter_cmp(b, insert, next) > 0);
+#else
+       if (next != btree_bkey_last(b, t) &&
+           __btree_node_iter_cmp(b, insert, next) > 0) {
+               struct bkey k1 = bkey_unpack_key(b, insert);
+               struct bkey k2 = bkey_unpack_key(b, next);
+               char buf1[100];
+               char buf2[100];
 
-       k = bkey_next(where);
-       BUG_ON(k != btree_bkey_last(b, t) &&
-              keys_out_of_order(b, where, k, iter->is_extents));
-
-       for_each_bset(b, t) {
-               if (where >= btree_bkey_first(b, t) ||
-                   where < btree_bkey_last(b, t))
-                       continue;
-
-               k = bch2_btree_node_iter_bset_pos(iter, b, t);
-
-               if (k == btree_bkey_last(b, t))
-                       k = bch2_bkey_prev_all(b, t, k);
-
-               while (bkey_cmp_left_packed_byval(b, k, bkey_start_pos(&uw)) > 0 &&
-                      (prev = bch2_bkey_prev_all(b, t, k)))
-                       k = prev;
-
-               for (;
-                    k != btree_bkey_last(b, t);
-                    k = bkey_next(k)) {
-                       uk = bkey_unpack_key(b, k);
-
-                       if (iter->is_extents) {
-                               BUG_ON(!(bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0 ||
-                                        bkey_cmp(uk.p, bkey_start_pos(&uw)) <= 0));
-                       } else {
-                               BUG_ON(!bkey_cmp(uw.p, uk.p) &&
-                                      !bkey_deleted(&uk));
-                       }
-
-                       if (bkey_cmp(uw.p, bkey_start_pos(&uk)) <= 0)
-                               break;
-               }
+               bch2_dump_btree_node(b);
+               bch2_bkey_to_text(buf1, sizeof(buf1), &k1);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &k2);
+
+               panic("insert > next:\n"
+                     "insert  key %5u %s\n"
+                     "next    key %5u %s\n",
+                      __btree_node_key_to_offset(b, insert), buf1,
+                      __btree_node_key_to_offset(b, next), buf2);
        }
+#endif
 }
 
 #else
 
 static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
-                                                  struct btree *b,
-                                                  struct bkey_packed *k) {}
+                                                  struct btree *b) {}
 
 #endif
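
Taken together, the rewritten debug hooks validate only the neighbourhood of an insert instead of rescanning whole bsets. A sketch of the invariant bch2_verify_insert_pos() asserts, with prev/next computed as in the function above (comment form only, not code from the tree):

/*
 * For an insert at `where` that overwrites clobber_u64s worth of key:
 *
 *   prev = bch2_bkey_prev_all(b, t, where)        -- may be NULL
 *   next = (void *) (where->_data + clobber_u64s) -- may be the bset end
 *
 * required ordering, in __btree_node_iter_cmp() terms:
 *
 *   prev <= insert  and  insert <= next
 *
 * (each check skipped when prev is NULL / next is the end of the bset)
 */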
 
@@ -622,28 +616,30 @@ static unsigned rw_aux_tree_bsearch(struct btree *b,
                                    struct bset_tree *t,
                                    unsigned offset)
 {
-       unsigned l = 0, r = t->size;
+       unsigned bset_offs = offset - btree_bkey_first_offset(t);
+       unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
+       unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
 
        EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
+       EBUG_ON(!t->size);
+       EBUG_ON(idx > t->size);
 
-       while (l < r) {
-               unsigned m = (l + r) >> 1;
-
-               if (rw_aux_tree(b, t)[m].offset < offset)
-                       l = m + 1;
-               else
-                       r = m;
-       }
+       while (idx < t->size &&
+              rw_aux_tree(b, t)[idx].offset < offset)
+               idx++;
 
-       EBUG_ON(l < t->size &&
-               rw_aux_tree(b, t)[l].offset < offset);
-       EBUG_ON(l &&
-               rw_aux_tree(b, t)[l - 1].offset >= offset);
+       while (idx &&
+              rw_aux_tree(b, t)[idx - 1].offset >= offset)
+               idx--;
 
-       EBUG_ON(l > r);
-       EBUG_ON(l > t->size);
+       EBUG_ON(idx < t->size &&
+               rw_aux_tree(b, t)[idx].offset < offset);
+       EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
+       EBUG_ON(idx + 1 < t->size &&
+               rw_aux_tree(b, t)[idx].offset ==
+               rw_aux_tree(b, t)[idx + 1].offset);
 
-       return l;
+       return idx;
 }
 
 static inline unsigned bfloat_mantissa(const struct bkey_float *f,
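
The rewritten rw_aux_tree_bsearch() replaces the binary search with an interpolated first guess plus a short linear walk, which converges quickly when rw aux tree entries are roughly evenly spaced. A standalone sketch of the same technique over a plain sorted array (illustrative names, not from the tree; assumes lo <= offset, as the kernel code's precondition guarantees):

#include <assert.h>

/* Lower bound: first index with sorted[idx] >= offset. */
static unsigned lower_bound_interpolated(const unsigned *sorted, unsigned size,
					 unsigned lo, unsigned hi,
					 unsigned offset)
{
	/* proportional first guess, clamped to the array: */
	unsigned idx = hi > lo
		? (unsigned) ((unsigned long long) (offset - lo) * size / (hi - lo))
		: 0;

	if (idx > size)
		idx = size;

	/* linear fixup, exactly as in rw_aux_tree_bsearch(): */
	while (idx < size && sorted[idx] < offset)
		idx++;
	while (idx && sorted[idx - 1] >= offset)
		idx--;

	/* same postconditions the EBUG_ON()s above check: */
	assert(!(idx < size && sorted[idx] < offset));
	assert(!(idx && sorted[idx - 1] >= offset));
	return idx;
}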
@@ -1129,9 +1125,10 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
  * modified, fix any auxiliary search tree by remaking all the nodes in the
  * auxiliary search tree that @k corresponds to
  */
-void bch2_bset_fix_invalidated_key(struct btree *b, struct bset_tree *t,
-                                  struct bkey_packed *k)
+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
 {
+       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
        switch (bset_aux_tree_type(t)) {
        case BSET_NO_AUX_TREE:
                break;
@@ -1158,13 +1155,9 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
        if (!bset_has_rw_aux_tree(t))
                return;
 
+       /* returns first entry >= where */
        l = rw_aux_tree_bsearch(b, t, where);
 
-       /* l is first >= than @where */
-
-       EBUG_ON(l < t->size && rw_aux_tree(b, t)[l].offset < where);
-       EBUG_ON(l && rw_aux_tree(b, t)[l - 1].offset >= where);
-
        if (!l) /* never delete first entry */
                l++;
        else if (l < t->size &&
@@ -1242,6 +1235,7 @@ void bch2_bset_insert(struct btree *b,
        struct bkey_packed packed, *src = bkey_to_packed(insert);
 
        bch2_bset_verify_rw_aux_tree(b, t);
+       bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
 
        if (bch2_bkey_pack_key(&packed, &insert->k, f))
                src = &packed;
@@ -1268,7 +1262,6 @@ void bch2_bset_insert(struct btree *b,
 
        bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
 
-       bch2_verify_key_order(b, iter, where);
        bch2_verify_btree_nr_keys(b);
 }
 
@@ -1474,11 +1467,11 @@ void bch2_btree_node_iter_push(struct btree_node_iter *iter,
 noinline __flatten __attribute__((cold))
 static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
                              struct btree *b, struct bpos search,
-                             bool strictly_greater, bool is_extents)
+                             bool strictly_greater)
 {
        struct bset_tree *t;
 
-       trace_bkey_pack_pos_fail(search);
+       trace_bkey_pack_pos_fail(&search);
 
        for_each_bset(b, t)
                __bch2_btree_node_iter_push(iter, b,
@@ -1531,7 +1524,7 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
  */
 void bch2_btree_node_iter_init(struct btree_node_iter *iter,
                               struct btree *b, struct bpos search,
-                              bool strictly_greater, bool is_extents)
+                              bool strictly_greater)
 {
        struct bset_tree *t;
        struct bkey_packed p, *packed_search = NULL;
@@ -1539,7 +1532,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
        EBUG_ON(bkey_cmp(search, b->data->min_key) < 0);
        bset_aux_tree_verify(b);
 
-       __bch2_btree_node_iter_init(iter, is_extents);
+       memset(iter, 0, sizeof(*iter));
 
        switch (bch2_bkey_pack_pos_lossy(&p, search, b)) {
        case BKEY_PACK_POS_EXACT:
@@ -1550,7 +1543,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
                break;
        case BKEY_PACK_POS_FAIL:
                btree_node_iter_init_pack_failed(iter, b, search,
-                                       strictly_greater, is_extents);
+                                                strictly_greater);
                return;
        }
 
@@ -1565,12 +1558,11 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter,
 }
 
 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
-                                         struct btree *b,
-                                         bool is_extents)
+                                         struct btree *b)
 {
        struct bset_tree *t;
 
-       __bch2_btree_node_iter_init(iter, is_extents);
+       memset(iter, 0, sizeof(*iter));
 
        for_each_bset(b, t)
                __bch2_btree_node_iter_push(iter, b,
@@ -1598,7 +1590,7 @@ static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
 {
        bool ret;
 
-       if ((ret = (btree_node_iter_cmp(iter, b,
+       if ((ret = (btree_node_iter_cmp(b,
                                        iter->data[first],
                                        iter->data[first + 1]) > 0)))
                swap(iter->data[first], iter->data[first + 1]);
@@ -1653,23 +1645,14 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
        btree_node_iter_sort_two(iter, b, 1);
 }
 
-/**
- * bch_btree_node_iter_advance - advance @iter by one key
- *
- * Doesn't do debugchecks - for cases where (insert_fixup_extent()) a bset might
- * momentarily have out of order extents.
- */
 void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
                                  struct btree *b)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct bkey_packed *k = bch2_btree_node_iter_peek_all(iter, b);
-
-       __bch2_btree_node_iter_advance(iter, b);
-       bch2_btree_node_iter_next_check(iter, b, k);
-#else
-       __bch2_btree_node_iter_advance(iter, b);
+       bch2_btree_node_iter_verify(iter, b);
+       bch2_btree_node_iter_next_check(iter, b);
 #endif
+       __bch2_btree_node_iter_advance(iter, b);
 }
 
 static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
@@ -1702,8 +1685,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
                        bch2_btree_node_iter_bset_pos(iter, b, t),
                        min_key_type);
                if (k &&
-                   (!prev || __btree_node_iter_cmp(iter->is_extents, b,
-                                                   k, prev) > 0)) {
+                   (!prev || __btree_node_iter_cmp(b, k, prev) > 0)) {
                        prev = k;
                        end = t->end_offset;
                }
@@ -1736,11 +1718,11 @@ out:
                struct btree_node_iter iter2 = *iter;
 
                if (prev)
-                       bch2_btree_node_iter_advance(&iter2, b);
+                       __bch2_btree_node_iter_advance(&iter2, b);
 
                while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
                        BUG_ON(k->type >= min_key_type);
-                       bch2_btree_node_iter_advance(&iter2, b);
+                       __bch2_btree_node_iter_advance(&iter2, b);
                }
        }
 
index 296c05b4f07a1fd675014f21987144e2a35d5e61..3a0ee491d2194c1bfddb5ccccd4445f6733d59f7 100644 (file)
@@ -342,8 +342,7 @@ void bch2_bset_init_first(struct btree *, struct bset *);
 void bch2_bset_init_next(struct bch_fs *, struct btree *,
                         struct btree_node_entry *);
 void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
-                                 struct bkey_packed *);
+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
 
 void bch2_bset_insert(struct btree *, struct btree_node_iter *,
                     struct bkey_packed *, struct bkey_i *, unsigned);
@@ -368,6 +367,17 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b,
        return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
 }
 
+/* Returns true if @k is after iterator position @pos */
+static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
+                                     const struct bkey *k)
+{
+       int cmp = bkey_cmp(k->p, iter->pos);
+
+       return cmp > 0 ||
+               (cmp == 0 &&
+                !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
+}
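
Spelled out, the new helper's cases (hypothetical keys; "live" means !bkey_deleted(), and for extents k->p is the extent's end position):

/*
 *   k->p >  iter->pos                       -> true, for any iterator
 *   k->p == iter->pos, non-extents, live    -> true  (peek should return k)
 *   k->p == iter->pos, non-extents, deleted -> false (skip the whiteout)
 *   k->p == iter->pos, extents              -> false (the extent ends at pos)
 */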
+
 /* Returns true if @k is after iterator position @pos */
 static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
                                             struct bpos *pos,
@@ -418,7 +428,7 @@ enum bch_extent_overlap {
 
 /* Returns how k overlaps with m */
 static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
-                                                        const struct bkey *m)
+                                                         const struct bkey *m)
 {
        int cmp1 = bkey_cmp(k->p, m->p) < 0;
        int cmp2 = bkey_cmp(bkey_start_pos(k),
@@ -429,20 +439,13 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
 
 /* Btree key iteration */
 
-static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
-                                             bool is_extents)
-{
-       iter->is_extents = is_extents;
-       memset(iter->data, 0, sizeof(iter->data));
-}
-
 void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
                              const struct bkey_packed *,
                              const struct bkey_packed *);
 void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
-                             struct bpos, bool, bool);
+                              struct bpos, bool);
 void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
-                                        struct btree *, bool);
+                                         struct btree *);
 struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
                                                 struct btree *,
                                                 struct bset_tree *);
@@ -469,32 +472,21 @@ static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
        return __btree_node_iter_set_end(iter, 0);
 }
 
-static inline int __btree_node_iter_cmp(bool is_extents,
-                                       struct btree *b,
-                                       struct bkey_packed *l,
-                                       struct bkey_packed *r)
+static inline int __btree_node_iter_cmp(struct btree *b,
+                                       const struct bkey_packed *l,
+                                       const struct bkey_packed *r)
 {
-       /*
-        * For non extents, when keys compare equal the deleted keys have to
-        * come first - so that bch2_btree_node_iter_next_check() can detect
-        * duplicate nondeleted keys (and possibly other reasons?)
-        *
-        * For extents, bkey_deleted() is used as a proxy for k->size == 0, so
-        * deleted keys have to sort last.
-        */
+       /* When keys compare equal, deleted keys come first */
        return bkey_cmp_packed(b, l, r)
-               ?: (is_extents
-                   ? (int) bkey_deleted(l) - (int) bkey_deleted(r)
-                   : (int) bkey_deleted(r) - (int) bkey_deleted(l))
+               ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
                ?: (l > r) - (l < r);
 }
 
-static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
-                                     struct btree *b,
+static inline int btree_node_iter_cmp(struct btree *b,
                                      struct btree_node_iter_set l,
                                      struct btree_node_iter_set r)
 {
-       return __btree_node_iter_cmp(iter->is_extents, b,
+       return __btree_node_iter_cmp(b,
                        __btree_node_offset_to_key(b, l.k),
                        __btree_node_offset_to_key(b, r.k));
 }
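
With is_extents gone the tiebreak is uniform. A minimal standalone analog of the three-level comparison (plain ints standing in for packed keys, a bool for the deleted bit, pointer identity as the final tiebreak):

#include <stdbool.h>

static int iter_cmp_analog(int lkey, bool ldeleted, const void *l,
			   int rkey, bool rdeleted, const void *r)
{
	int cmp = (lkey > rkey) - (lkey < rkey);

	if (cmp)			/* 1. by key */
		return cmp;
	if (ldeleted != rdeleted)	/* 2. deleted sorts first */
		return rdeleted - ldeleted;
	return (l > r) - (l < r);	/* 3. by position in the node */
}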
@@ -581,21 +573,12 @@ bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
        return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
 }
 
-/*
- * Iterates over all _live_ keys - skipping deleted (and potentially
- * overlapping) keys
- */
-#define for_each_btree_node_key(b, k, iter, _is_extents)               \
-       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
-            ((k) = bch2_btree_node_iter_peek(iter, b));                        \
-            bch2_btree_node_iter_advance(iter, b))
-
 struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
                                                struct btree *,
                                                struct bkey *);
 
-#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
-       for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked)           \
+       for (bch2_btree_node_iter_init_from_start((iter), (b));         \
             (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
             bch2_btree_node_iter_advance(iter, b))
 
@@ -620,6 +603,13 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
 #define btree_keys_account_key_drop(_nr, _bset_idx, _k)        \
        btree_keys_account_key(_nr, _bset_idx, _k, -1)
 
+#define btree_account_key_add(_b, _k)                          \
+       btree_keys_account_key(&(_b)->nr,                       \
+               bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
+#define btree_account_key_drop(_b, _k)                         \
+       btree_keys_account_key(&(_b)->nr,                       \
+               bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
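
The wrappers save call sites from tracking which bset a key lives in, since bch2_bkey_to_bset() recovers it from the key pointer. A hypothetical overwrite sequence using them:

/*
 *	btree_account_key_drop(b, old_k);
 *	... overwrite old_k in place with the new key ...
 *	btree_account_key_add(b, new_k);
 */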
+
 struct bset_stats {
        struct {
                size_t nr, bytes;
@@ -645,17 +635,18 @@ void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
 
 void __bch2_verify_btree_nr_keys(struct btree *);
 void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
-                         struct bkey_packed *);
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
+                           struct bkey_packed *, unsigned);
 
 #else
 
 static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
 static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                                              struct btree *b) {}
-static inline void bch2_verify_key_order(struct btree *b,
-                                       struct btree_node_iter *iter,
-                                       struct bkey_packed *where) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+                                         struct bkey_packed *where,
+                                         struct bkey_packed *insert,
+                                         unsigned clobber_u64s) {}
 #endif
 
 static inline void bch2_verify_btree_nr_keys(struct btree *b)
index 969c1f19414e3c52d5001af30c37e7ab90b8c6b7..7c18d8303aaa05c943bb10b1097f56805b3d0143 100644 (file)
@@ -122,13 +122,14 @@ static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 
        switch (type) {
        case BKEY_TYPE_BTREE:
-               bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, NULL,
+               bch2_mark_key(c, k, c->opts.btree_node_size,
+                             BCH_DATA_BTREE, pos, NULL,
                              0, flags|
                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                              BCH_BUCKET_MARK_GC_LOCK_HELD);
                break;
        case BKEY_TYPE_EXTENTS:
-               bch2_mark_key(c, k, k.k->size, false, pos, NULL,
+               bch2_mark_key(c, k, k.k->size, BCH_DATA_USER, pos, NULL,
                              0, flags|
                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                              BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -215,7 +216,6 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
 
        if (btree_node_has_ptrs(b))
                for_each_btree_node_key_unpack(b, k, &iter,
-                                              btree_node_is_extents(b),
                                               &unpacked) {
                        bch2_bkey_debugcheck(c, b, k);
                        stale = max(stale, bch2_gc_mark_key(c, type, k, 0));
@@ -324,9 +324,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
        unsigned i;
        u64 b;
 
+       /*
+        * This conditional is kind of gross, but we may be called from the
+        * device add path, before the new device has actually been added to the
+        * running filesystem:
+        */
        if (c) {
                lockdep_assert_held(&c->sb_lock);
                percpu_down_read_preempt_disable(&c->usage_lock);
+       } else {
+               preempt_disable();
        }
 
        for (i = 0; i < layout->nr_superblocks; i++) {
@@ -354,6 +361,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
        if (c) {
                percpu_up_read_preempt_enable(&c->usage_lock);
                spin_unlock(&c->journal.lock);
+       } else {
+               preempt_enable();
        }
 }
 
@@ -386,7 +395,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
        for_each_pending_btree_node_free(c, as, d)
                if (d->index_update_done)
                        bch2_mark_key(c, bkey_i_to_s_c(&d->key),
-                                     c->opts.btree_node_size, true, pos,
+                                     c->opts.btree_node_size,
+                                     BCH_DATA_BTREE, pos,
                                      &stats, 0,
                                      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
                                      BCH_BUCKET_MARK_GC_LOCK_HELD);
@@ -479,7 +489,8 @@ static void bch2_gc_start(struct bch_fs *c)
                struct bch_fs_usage *p =
                        per_cpu_ptr(c->usage_percpu, cpu);
 
-               memset(p->s, 0, sizeof(p->s));
+               memset(p->replicas, 0, sizeof(p->replicas));
+               memset(p->buckets, 0, sizeof(p->buckets));
        }
 
        percpu_up_write(&c->usage_lock);
@@ -558,9 +569,6 @@ void bch2_gc(struct bch_fs *c)
        bch2_mark_pending_btree_node_frees(c);
        bch2_mark_allocator_buckets(c);
 
-       for_each_member_device(ca, c, i)
-               atomic_long_set(&ca->saturated_count, 0);
-
        /* Indicates that gc is no longer in progress: */
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        c->gc_count++;
@@ -587,15 +595,14 @@ out:
 
 static void recalc_packed_keys(struct btree *b)
 {
+       struct bset *i = btree_bset_first(b);
        struct bkey_packed *k;
 
        memset(&b->nr, 0, sizeof(b->nr));
 
        BUG_ON(b->nsets != 1);
 
-       for (k =  btree_bkey_first(b, b->set);
-            k != btree_bkey_last(b, b->set);
-            k = bkey_next(k))
+       vstruct_for_each(i, k)
                btree_keys_account_key_add(&b->nr, 0, k);
 }
 
@@ -1032,7 +1039,6 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
                        struct bkey_s_c k;
 
                        for_each_btree_node_key_unpack(b, k, &node_iter,
-                                                      btree_node_is_extents(b),
                                                       &unpacked) {
                                ret = bch2_btree_mark_key_initial(c,
                                                        btree_node_type(b), k);
index 94f56dbbeac33326c0b86ea3bcc6dda6f40d61ef..d83144b7938efb80bac16cc2906dac2342f2daaa 100644 (file)
@@ -22,7 +22,7 @@
 /* btree_node_iter_large: */
 
 #define btree_node_iter_cmp_heap(h, _l, _r)                            \
-       __btree_node_iter_cmp((iter)->is_extents, b,                    \
+       __btree_node_iter_cmp(b,                                        \
                               __btree_node_offset_to_key(b, (_l).k),   \
                               __btree_node_offset_to_key(b, (_r).k))
 
@@ -248,6 +248,9 @@ static unsigned sort_extent_whiteouts(struct bkey_packed *dst,
        sort_iter_sort(iter, sort_extent_whiteouts_cmp);
 
        while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+               if (bkey_deleted(in))
+                       continue;
+
                EBUG_ON(bkeyp_val_u64s(f, in));
                EBUG_ON(in->type != KEY_TYPE_DISCARD);
 
@@ -309,7 +312,7 @@ static unsigned should_compact_bset(struct btree *b, struct bset_tree *t,
 
        if (mode == COMPACT_LAZY) {
                if (should_compact_bset_lazy(b, t) ||
-                   (compacting && bset_unwritten(b, bset(b, t))))
+                   (compacting && !bset_written(b, bset(b, t))))
                        return dead_u64s;
        } else {
                if (bset_written(b, bset(b, t)))
@@ -356,7 +359,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                struct bkey_packed *k, *n, *out, *start, *end;
                struct btree_node_entry *src = NULL, *dst = NULL;
 
-               if (t != b->set && bset_unwritten(b, i)) {
+               if (t != b->set && !bset_written(b, i)) {
                        src = container_of(i, struct btree_node_entry, keys);
                        dst = max(write_block(b),
                                  (void *) btree_bkey_last(b, t -1));
@@ -396,7 +399,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                                continue;
 
                        if (bkey_whiteout(k)) {
-                               unreserve_whiteout(b, t, k);
+                               unreserve_whiteout(b, k);
                                memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
                                set_bkeyp_val_u64s(f, u_pos, 0);
                                u_pos = bkey_next(u_pos);
@@ -467,7 +470,7 @@ static bool bch2_drop_whiteouts(struct btree *b)
                start   = btree_bkey_first(b, t);
                end     = btree_bkey_last(b, t);
 
-               if (bset_unwritten(b, i) &&
+               if (!bset_written(b, i) &&
                    t != b->set) {
                        struct bset *dst =
                               max_t(struct bset *, write_block(b),
@@ -785,8 +788,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
        bch2_bset_set_no_aux_tree(dst, dst->set);
 
-       bch2_btree_node_iter_init_from_start(&src_iter, src,
-                                           btree_node_is_extents(src));
+       bch2_btree_node_iter_init_from_start(&src_iter, src);
 
        if (btree_node_ops(src)->key_normalize ||
            btree_node_ops(src)->key_merge)
@@ -829,7 +831,7 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b,
        for (unwritten_idx = 0;
             unwritten_idx < b->nsets;
             unwritten_idx++)
-               if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+               if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
                        break;
 
        if (b->nsets - unwritten_idx > 1) {
@@ -852,7 +854,7 @@ void bch2_btree_build_aux_trees(struct btree *b)
 
        for_each_bset(b, t)
                bch2_bset_build_aux_tree(b, t,
-                               bset_unwritten(b, bset(b, t)) &&
+                               !bset_written(b, bset(b, t)) &&
                                t == bset_tree_last(b));
 }
 
@@ -1171,7 +1173,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
        int ret, retry_read = 0, write = READ;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
-       __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+       iter->used = 0;
 
        if (bch2_meta_read_fault("btree"))
                btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
@@ -1945,9 +1947,9 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
        clear_btree_node_just_written(b);
 
        /*
-        * Note: immediately after write, bset_unwritten()/bset_written() don't
-        * work - the amount of data we had to write after compaction might have
-        * been smaller than the offset of the last bset.
+        * Note: immediately after write, bset_written() doesn't work - the
+        * amount of data we had to write after compaction might have been
+        * smaller than the offset of the last bset.
         *
         * However, we know that all bsets have been written here, as long as
         * we're still holding the write lock:
index fa154642515128bae9d2dd9645fbf347572f9b72..ccd47326d16bbd739abee3a1dda9430072fcde6e 100644 (file)
@@ -145,20 +145,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
 /* Sorting */
 
 struct btree_node_iter_large {
-       u8              is_extents;
        u16             used;
 
        struct btree_node_iter_set data[MAX_BSETS];
 };
 
-static inline void
-__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
-                                 bool is_extents)
-{
-       iter->used = 0;
-       iter->is_extents = is_extents;
-}
-
 void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
                                        struct btree *);
 
index a52ec12e9058d88b7c64cfdfd8f1498e6ef9b1a2..c37d82ae258bd872bcfa56cbab97d8229ad15cf4 100644 (file)
@@ -34,10 +34,10 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
        struct btree_iter *linked;
 
        EBUG_ON(iter->l[b->level].b != b);
-       EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
+       EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
 
        for_each_btree_iter_with_node(iter, b, linked)
-               linked->lock_seq[b->level] += 2;
+               linked->l[b->level].lock_seq += 2;
 
        six_unlock_write(&b->lock);
 }
@@ -68,26 +68,6 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
                     &b->lock.state.counter);
 }
 
-/*
- * Lock a btree node if we already have it locked on one of our linked
- * iterators:
- */
-static inline bool btree_node_lock_increment(struct btree_iter *iter,
-                                            struct btree *b, unsigned level,
-                                            enum btree_node_locked_type want)
-{
-       struct btree_iter *linked;
-
-       for_each_linked_btree_iter(iter, linked)
-               if (linked->l[level].b == b &&
-                   btree_node_locked_type(linked, level) >= want) {
-                       six_lock_increment(&b->lock, want);
-                       return true;
-               }
-
-       return false;
-}
-
 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
 {
        struct btree *b = btree_iter_node(iter, level);
@@ -99,8 +79,8 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
        if (race_fault())
                return false;
 
-       if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
-           !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+       if (!six_relock_type(&b->lock, want, iter->l[level].lock_seq) &&
+           !(iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
              btree_node_lock_increment(iter, b, level, want)))
                return false;
 
@@ -125,10 +105,10 @@ static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
 
        if (btree_node_locked(iter, level)
            ? six_lock_tryupgrade(&b->lock)
-           : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
+           : six_relock_type(&b->lock, SIX_LOCK_intent, iter->l[level].lock_seq))
                goto success;
 
-       if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+       if (iter->l[level].lock_seq >> 1 == b->lock.state.seq >> 1 &&
            btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
                btree_node_unlock(iter, level);
                goto success;
@@ -189,34 +169,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
        struct btree_iter *linked;
        bool ret = true;
 
-       /* Can't have children locked before ancestors: */
-       EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
-
-       /*
-        * Can't hold any read locks while we block taking an intent lock - see
-        * below for reasoning, and we should have already dropped any read
-        * locks in the current iterator
-        */
-       EBUG_ON(type == SIX_LOCK_intent &&
-               iter->nodes_locked != iter->nodes_intent_locked);
-
-       if (btree_node_lock_increment(iter, b, level, type))
-               return true;
-
-       /*
-        * Must lock btree nodes in key order - this case happens when locking
-        * the prev sibling in btree node merging:
-        */
-       if (iter->nodes_locked &&
-           __ffs(iter->nodes_locked) <= level &&
-           __btree_iter_cmp(iter->btree_id, pos, iter))
-               return false;
-
-       for_each_linked_btree_iter(iter, linked) {
+       /* Check if it's safe to block: */
+       for_each_btree_iter(iter, linked) {
                if (!linked->nodes_locked)
                        continue;
 
-               /* We have to lock btree nodes in key order: */
+               /* Must lock btree nodes in key order: */
                if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
                        ret = false;
 
@@ -251,9 +209,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                if (linked->btree_id == iter->btree_id &&
                    level > __fls(linked->nodes_locked)) {
                        if (may_drop_locks) {
-                               linked->locks_want = max_t(unsigned,
-                                                          linked->locks_want,
-                                                          iter->locks_want);
+                               linked->locks_want =
+                                       max(level + 1, max_t(unsigned,
+                                           linked->locks_want,
+                                           iter->locks_want));
                                btree_iter_get_locks(linked, true);
                        }
                        ret = false;
@@ -415,14 +374,20 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
        struct btree_node_iter tmp = l->iter;
        struct bkey_packed *k;
 
+       if (iter->uptodate > BTREE_ITER_NEED_PEEK)
+               return;
+
        bch2_btree_node_iter_verify(&l->iter, b);
 
        /*
         * For interior nodes, the iterator will have skipped past
         * deleted keys:
+        *
+        * For extents, the iterator may have skipped past deleted keys (but not
+        * whiteouts)
         */
-       k = b->level
-               ? bch2_btree_node_iter_prev(&tmp, b)
+       k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
+               ? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_DISCARD)
                : bch2_btree_node_iter_prev_all(&tmp, b);
        if (k && btree_iter_pos_cmp_packed(b, &iter->pos, k,
                                iter->flags & BTREE_ITER_IS_EXTENTS)) {
@@ -430,7 +395,7 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
                struct bkey uk = bkey_unpack_key(b, k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &uk);
-               panic("prev key should be before after pos:\n%s\n%llu:%llu\n",
+               panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
                      buf, iter->pos.inode, iter->pos.offset);
        }
 
@@ -441,15 +406,16 @@ static void __bch2_btree_iter_verify(struct btree_iter *iter,
                struct bkey uk = bkey_unpack_key(b, k);
 
                bch2_bkey_to_text(buf, sizeof(buf), &uk);
-               panic("next key should be before iter pos:\n%llu:%llu\n%s\n",
+               panic("iter should be after current key:\n"
+                     "iter pos %llu:%llu\n"
+                     "cur key  %s\n",
                      iter->pos.inode, iter->pos.offset, buf);
        }
 
-       if (iter->uptodate == BTREE_ITER_UPTODATE &&
-           (iter->flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES) {
-               BUG_ON(!bkey_whiteout(&iter->k) &&
-                      bch2_btree_node_iter_end(&l->iter));
-       }
+       BUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
+              (iter->flags & BTREE_ITER_TYPE) == BTREE_ITER_KEYS &&
+              !bkey_whiteout(&iter->k) &&
+              bch2_btree_node_iter_end(&l->iter));
 }
 
 void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
@@ -460,6 +426,11 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
                __bch2_btree_iter_verify(linked, b);
 }
 
+#else
+
+static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
+                                           struct btree *b) {}
+
 #endif
 
 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
@@ -474,7 +445,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
        struct btree_node_iter_set *set;
        unsigned offset = __btree_node_key_to_offset(b, where);
        int shift = new_u64s - clobber_u64s;
-       unsigned old_end = (int) __btree_node_key_to_offset(b, end) - shift;
+       unsigned old_end = t->end_offset - shift;
 
        btree_node_iter_for_each(node_iter, set)
                if (set->end == old_end)
@@ -496,7 +467,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
        }
        return;
 found:
-       set->end = (int) set->end + shift;
+       set->end = t->end_offset;
 
        /* Iterator hasn't gotten to the key that changed yet: */
        if (set->k < offset)
@@ -557,8 +528,7 @@ iter_current_key_not_modified:
                        k = bch2_bkey_prev_all(b, t,
                                bch2_btree_node_iter_bset_pos(node_iter, b, t));
                        if (k &&
-                           __btree_node_iter_cmp(node_iter, b,
-                                                 k, where) > 0) {
+                           __btree_node_iter_cmp(b, k, where) > 0) {
                                struct btree_node_iter_set *set;
                                unsigned offset =
                                        __btree_node_key_to_offset(b, bkey_next(k));
@@ -580,13 +550,13 @@ next_bset:
 }
 
 void bch2_btree_node_iter_fix(struct btree_iter *iter,
-                            struct btree *b,
-                            struct btree_node_iter *node_iter,
-                            struct bset_tree *t,
-                            struct bkey_packed *where,
-                            unsigned clobber_u64s,
-                            unsigned new_u64s)
+                             struct btree *b,
+                             struct btree_node_iter *node_iter,
+                             struct bkey_packed *where,
+                             unsigned clobber_u64s,
+                             unsigned new_u64s)
 {
+       struct bset_tree *t = bch2_bkey_to_bset(b, where);
        struct btree_iter *linked;
 
        if (node_iter != &iter->l[b->level].iter)
@@ -597,10 +567,6 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                __bch2_btree_node_iter_fix(linked, b,
                                          &linked->l[b->level].iter, t,
                                          where, clobber_u64s, new_u64s);
-
-       /* interior node iterators are... special... */
-       if (!b->level)
-               bch2_btree_iter_verify(iter, b);
 }
 
 static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
@@ -687,17 +653,6 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                btree_node_unlock(iter, b->level + 1);
 }
 
-/* Returns true if @k is after iterator position @pos */
-static inline bool btree_iter_pos_cmp(struct btree_iter *iter,
-                                     const struct bkey *k)
-{
-       int cmp = bkey_cmp(k->p, iter->pos);
-
-       return cmp > 0 ||
-               (cmp == 0 &&
-                !(iter->flags & BTREE_ITER_IS_EXTENTS) && !bkey_deleted(k));
-}
-
 static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
                                             struct btree *b)
 {
@@ -719,8 +674,7 @@ static inline void __btree_iter_init(struct btree_iter *iter,
        struct btree_iter_level *l = &iter->l[b->level];
 
        bch2_btree_node_iter_init(&l->iter, b, iter->pos,
-                                 iter->flags & BTREE_ITER_IS_EXTENTS,
-                                 btree_node_is_extents(b));
+                                 iter->flags & BTREE_ITER_IS_EXTENTS);
 
        /* Skip to first non whiteout: */
        if (b->level)
@@ -737,7 +691,7 @@ static inline void btree_iter_node_set(struct btree_iter *iter,
        EBUG_ON(!btree_iter_pos_in_node(iter, b));
        EBUG_ON(b->lock.state.seq & 1);
 
-       iter->lock_seq[b->level] = b->lock.state.seq;
+       iter->l[b->level].lock_seq = b->lock.state.seq;
        iter->l[b->level].b = b;
        __btree_iter_init(iter, b);
 }
@@ -1020,8 +974,6 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        if (__bch2_btree_iter_relock(iter))
                return 0;
 
-       iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
-
        /*
         * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
         * here unnecessary
@@ -1062,7 +1014,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        }
 
        iter->uptodate = BTREE_ITER_NEED_PEEK;
+
        bch2_btree_iter_verify_locks(iter);
+       __bch2_btree_iter_verify(iter, iter->l[iter->level].b);
        return 0;
 }
 
@@ -1083,7 +1037,6 @@ static inline void bch2_btree_iter_checks(struct btree_iter *iter,
                                          enum btree_iter_type type)
 {
        EBUG_ON(iter->btree_id >= BTREE_ID_NR);
-       EBUG_ON((iter->flags & BTREE_ITER_TYPE) != type);
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS &&
                 type != BTREE_ITER_NODES));
@@ -1199,10 +1152,8 @@ void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_
                                          iter->flags & BTREE_ITER_IS_EXTENTS))
                __btree_iter_advance(l);
 
-       if (!k && btree_iter_pos_after_node(iter, l->b)) {
+       if (!k && btree_iter_pos_after_node(iter, l->b))
                btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               iter->flags |= BTREE_ITER_AT_END_OF_LEAF;
-       }
 }
 
 void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
@@ -1403,9 +1354,10 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 }
 
 static inline struct bkey_s_c
-__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
 {
        struct btree_iter_level *l = &iter->l[0];
+       struct btree_node_iter node_iter;
        struct bkey_s_c k;
        struct bkey n;
        int ret;
@@ -1416,6 +1368,17 @@ recheck:
               bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
                __btree_iter_advance(l);
 
+       /*
+        * iterator is now at the correct position for inserting at iter->pos,
+        * but we need to keep iterating until we find the first non whiteout so
+        * we know how big a hole we have, if any:
+        */
+
+       node_iter = l->iter;
+       if (k.k && bkey_whiteout(k.k))
+               k = __btree_iter_unpack(iter, l, &iter->k,
+                       bch2_btree_node_iter_peek(&node_iter, l->b));
+
        /*
         * If we got to the end of the node, check if we need to traverse to the
         * next node:
@@ -1432,6 +1395,13 @@ recheck:
        if (k.k &&
            !bkey_whiteout(k.k) &&
            bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+               /*
+                * if we skipped forward to find the first non whiteout and
+                * there _wasn't_ actually a hole, we want the iterator to be
+                * pointed at the key we found:
+                */
+               l->iter = node_iter;
+
                EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
                EBUG_ON(bkey_deleted(k.k));
                iter->uptodate = BTREE_ITER_UPTODATE;
@@ -1439,41 +1409,88 @@ recheck:
        }
 
        /* hole */
+
+       /* holes can't span inode numbers: */
+       if (iter->pos.offset == KEY_OFFSET_MAX) {
+               if (iter->pos.inode == KEY_INODE_MAX)
+                       return bkey_s_c_null;
+
+               iter->pos = bkey_successor(iter->pos);
+               goto recheck;
+       }
+
+       if (!k.k)
+               k.k = &l->b->key.k;
+
        bkey_init(&n);
        n.p = iter->pos;
+       bch2_key_resize(&n,
+                       min_t(u64, KEY_SIZE_MAX,
+                             (k.k->p.inode == n.p.inode
+                              ? bkey_start_offset(k.k)
+                              : KEY_OFFSET_MAX) -
+                             n.p.offset));
+
+       //EBUG_ON(!n.size);
+       if (!n.size) {
+               char buf[100];
+               bch2_dump_btree_node(iter->l[0].b);
+
+               bch2_bkey_to_text(buf, sizeof(buf), k.k);
+               panic("iter at %llu:%llu\n"
+                     "next key %s\n",
+                     iter->pos.inode,
+                     iter->pos.offset,
+                     buf);
+       }
 
-       if (iter->flags & BTREE_ITER_IS_EXTENTS) {
-               if (n.p.offset == KEY_OFFSET_MAX) {
-                       if (n.p.inode == KEY_INODE_MAX)
-                               return bkey_s_c_null;
-
-                       iter->pos = bkey_successor(iter->pos);
-                       goto recheck;
-               }
+       iter->k = n;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+       return (struct bkey_s_c) { &iter->k, NULL };
+}
 
-               if (k.k && bkey_whiteout(k.k)) {
-                       struct btree_node_iter node_iter = l->iter;
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_slot(struct btree_iter *iter)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_s_c k;
+       int ret;
 
-                       k = __btree_iter_unpack(iter, l, &iter->k,
-                               bch2_btree_node_iter_peek(&node_iter, l->b));
-               }
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return __bch2_btree_iter_peek_slot_extents(iter);
 
-               if (!k.k)
-                       k.k = &l->b->key.k;
+recheck:
+       while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
+              bkey_deleted(k.k) &&
+              bkey_cmp(k.k->p, iter->pos) == 0)
+               __btree_iter_advance(l);
 
-               bch2_key_resize(&n,
-                               min_t(u64, KEY_SIZE_MAX,
-                                     (k.k->p.inode == n.p.inode
-                                      ? bkey_start_offset(k.k)
-                                      : KEY_OFFSET_MAX) -
-                                     n.p.offset));
+       /*
+        * If we got to the end of the node, check if we need to traverse to the
+        * next node:
+        */
+       if (unlikely(!k.k && btree_iter_pos_after_node(iter, l->b))) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               ret = bch2_btree_iter_traverse(iter);
+               if (unlikely(ret))
+                       return bkey_s_c_err(ret);
 
-               EBUG_ON(!n.size);
+               goto recheck;
        }
 
-       iter->k = n;
-       iter->uptodate = BTREE_ITER_UPTODATE;
-       return (struct bkey_s_c) { &iter->k, NULL };
+       if (k.k &&
+           !bkey_deleted(k.k) &&
+           !bkey_cmp(iter->pos, k.k->p)) {
+               iter->uptodate = BTREE_ITER_UPTODATE;
+               return k;
+       } else {
+               /* hole */
+               bkey_init(&iter->k);
+               iter->k.p = iter->pos;
+
+               iter->uptodate = BTREE_ITER_UPTODATE;
+               return (struct bkey_s_c) { &iter->k, NULL };
+       }
 }
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
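
A worked example for the hole path above (hypothetical positions; this assumes bch2_key_resize() keeps bkey_start_pos() fixed and moves p.offset forward by the new size):

/*
 *   iter->pos = 1:100, next live extent covers [1:200, 1:250)
 *     bkey_start_offset(k.k)  = 200
 *     hole size               = min(KEY_SIZE_MAX, 200 - 100) = 100
 *     synthetic key n: start = 1:100, p = 1:200, size = 100
 *
 *   If the next key is in a different inode, the hole runs out to
 *   KEY_OFFSET_MAX, clamped to KEY_SIZE_MAX.
 */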
@@ -1611,17 +1628,29 @@ static void btree_trans_verify(struct btree_trans *trans)
        }
 }
 
+static inline unsigned btree_trans_iter_idx(struct btree_trans *trans,
+                                           struct btree_iter *iter)
+{
+       ssize_t idx = iter - trans->iters;
+
+       BUG_ON(idx < 0 || idx >= trans->nr_iters);
+       BUG_ON(!(trans->iters_live & (1U << idx)));
+
+       return idx;
+}
+
+void bch2_trans_iter_put(struct btree_trans *trans,
+                        struct btree_iter *iter)
+{
+       ssize_t idx = btree_trans_iter_idx(trans, iter);
+
+       trans->iters_live       &= ~(1U << idx);
+}
+
 void bch2_trans_iter_free(struct btree_trans *trans,
                          struct btree_iter *iter)
 {
-       unsigned idx;
-
-       for (idx = 0; idx < trans->nr_iters; idx++)
-               if (&trans->iters[idx] == iter)
-                       goto found;
-       BUG();
-found:
-       BUG_ON(!(trans->iters_linked & (1U << idx)));
+       ssize_t idx = btree_trans_iter_idx(trans, iter);
 
        trans->iters_live       &= ~(1U << idx);
        trans->iters_linked     &= ~(1U << idx);
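
The put/free split falls directly out of the two bitmasks; roughly, the intended lifecycle:

/*
 * iters_linked: slot holds an initialized iterator
 * iters_live:   the caller still holds a reference to it
 *
 * bch2_trans_iter_put():  clears only the live bit - the iterator stays
 *                         linked, so __btree_trans_get_iter() can hand the
 *                         slot back out without reinitializing it
 * bch2_trans_iter_free(): clears both bits; the slot is fully recycled
 */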
@@ -1635,10 +1664,7 @@ static int btree_trans_realloc_iters(struct btree_trans *trans)
 
        bch2_trans_unlock(trans);
 
-       new_iters = kmalloc(sizeof(struct btree_iter) * BTREE_ITER_MAX,
-                           GFP_NOFS);
-       if (!new_iters)
-               return -ENOMEM;
+       new_iters = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
 
        memcpy(new_iters, trans->iters,
               sizeof(struct btree_iter) * trans->nr_iters);
@@ -1666,12 +1692,10 @@ static int btree_trans_realloc_iters(struct btree_trans *trans)
        return 0;
 }
 
-int bch2_trans_preload_iters(struct btree_trans *trans)
+void bch2_trans_preload_iters(struct btree_trans *trans)
 {
-       if (trans->iters != trans->iters_onstack)
-               return 0;
-
-       return btree_trans_realloc_iters(trans);
+       if (trans->iters == trans->iters_onstack)
+               btree_trans_realloc_iters(trans);
 }
 
 static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
@@ -1711,10 +1735,6 @@ got_slot:
        } else {
                iter = &trans->iters[idx];
 
-               BUG_ON(iter->btree_id != btree_id);
-               BUG_ON((iter->flags ^ flags) &
-                      (BTREE_ITER_SLOTS|BTREE_ITER_IS_EXTENTS));
-
                iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
                iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
        }
@@ -1731,6 +1751,9 @@ got_slot:
 
        btree_trans_verify(trans);
 
+       BUG_ON(iter->btree_id != btree_id);
+       BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
+
        return iter;
 }
 
@@ -1855,7 +1878,7 @@ int bch2_trans_exit(struct btree_trans *trans)
 
        kfree(trans->mem);
        if (trans->iters != trans->iters_onstack)
-               kfree(trans->iters);
+               mempool_free(trans->iters, &trans->c->btree_iters_pool);
        trans->mem      = (void *) 0x1;
        trans->iters    = (void *) 0x1;
        return ret;
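
bch2_trans_preload_iters() can turn void because mempool_alloc() with a gfp mask that permits reclaim (GFP_NOFS here) blocks until an element is available rather than returning NULL, so the ENOMEM path disappears. Presumably the pool is created at filesystem init sized for one full iterator array, something like (hypothetical, not shown in this hunk):

mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
			  sizeof(struct btree_iter) * BTREE_ITER_MAX);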
index d046ad71a7ba21213b283709a58b6f2380bd8176..1a1ca952c7e5f130668dfd7ec33e3db4157b9e88 100644 (file)
@@ -40,7 +40,7 @@ static inline bool __iter_has_node(const struct btree_iter *iter,
         */
 
        return iter->l[b->level].b == b &&
-               iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
+               iter->l[b->level].lock_seq >> 1 == b->lock.state.seq >> 1;
 }
 
 static inline struct btree_iter *
@@ -100,8 +100,8 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
 #endif
 
 void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
-                            struct btree_node_iter *, struct bset_tree *,
-                            struct bkey_packed *, unsigned, unsigned);
+                             struct btree_node_iter *, struct bkey_packed *,
+                             unsigned, unsigned);
 
 int bch2_btree_iter_unlock(struct btree_iter *);
 
@@ -271,9 +271,9 @@ static inline int btree_iter_err(struct bkey_s_c k)
 
 /* new multiple iterator interface: */
 
-int bch2_trans_preload_iters(struct btree_trans *);
-void bch2_trans_iter_free(struct btree_trans *,
-                               struct btree_iter *);
+void bch2_trans_preload_iters(struct btree_trans *);
+void bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
+void bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
 
 struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
                                         struct bpos, unsigned, u64);
@@ -308,6 +308,11 @@ bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
 
 void __bch2_trans_begin(struct btree_trans *);
 
+static inline void bch2_trans_begin_updates(struct btree_trans *trans)
+{
+       trans->nr_updates = 0;
+}
+
 void *bch2_trans_kmalloc(struct btree_trans *, size_t);
 int bch2_trans_unlock(struct btree_trans *);
 void bch2_trans_init(struct btree_trans *, struct bch_fs *);
index 419d0e815a2527d77cf3dcab10b35d1b5413cf9a..9bbed99eb6acbcd5fdf2ee827c158c6b150e912d 100644 (file)
@@ -146,6 +146,26 @@ static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
                __btree_node_lock_type(c, b, type);
 }
 
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_iter *iter,
+                                            struct btree *b, unsigned level,
+                                            enum btree_node_locked_type want)
+{
+       struct btree_iter *linked;
+
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->l[level].b == b &&
+                   btree_node_locked_type(linked, level) >= want) {
+                       six_lock_increment(&b->lock, want);
+                       return true;
+               }
+
+       return false;
+}
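
(If a linked iterator already holds this lock at least as strongly as @want, six_lock_increment() merely takes an additional reference on a lock known to be held and can never block; this gives btree_node_lock() below a deadlock-free fast path before it falls back to the full __bch2_btree_node_lock() slow path.)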
+
 bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
                            struct btree_iter *, enum six_lock_type, bool);
 
@@ -158,6 +178,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
        return likely(six_trylock_type(&b->lock, type)) ||
+               btree_node_lock_increment(iter, b, level, type) ||
                __bch2_btree_node_lock(b, pos, level, iter,
                                       type, may_drop_locks);
 }
@@ -184,7 +205,7 @@ void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
 static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
 {
        EBUG_ON(iter->l[b->level].b != b);
-       EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+       EBUG_ON(iter->l[b->level].lock_seq != b->lock.state.seq);
 
        if (!six_trylock_write(&b->lock))
                __bch2_btree_node_lock_write(b, iter);
index 39e2db757f9a92f7555faa5f953a72528059d600..5f137af4da53e23b8ec6b05f3d6baec1e89cb291 100644 (file)
@@ -175,8 +175,6 @@ struct btree_cache {
 };
 
 struct btree_node_iter {
-       u8              is_extents;
-
        struct btree_node_iter_set {
                u16     k, end;
        } data[MAX_BSETS];
@@ -197,11 +195,7 @@ enum btree_iter_type {
  * @pos or the first key strictly greater than @pos
  */
 #define BTREE_ITER_IS_EXTENTS          (1 << 4)
-/*
- * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
- */
-#define BTREE_ITER_AT_END_OF_LEAF      (1 << 5)
-#define BTREE_ITER_ERROR               (1 << 6)
+#define BTREE_ITER_ERROR               (1 << 5)
 
 enum btree_iter_uptodate {
        BTREE_ITER_UPTODATE             = 0,
@@ -232,10 +226,9 @@ struct btree_iter {
        struct btree_iter_level {
                struct btree    *b;
                struct btree_node_iter iter;
+               u32             lock_seq;
        }                       l[BTREE_MAX_DEPTH];
 
-       u32                     lock_seq[BTREE_MAX_DEPTH];
-
        /*
         * Current unpacked key - so that bch2_btree_iter_next()/
         * bch2_btree_iter_next_slot() can correctly advance pos.
@@ -258,12 +251,6 @@ struct btree_iter {
 struct btree_insert_entry {
        struct btree_iter *iter;
        struct bkey_i   *k;
-       unsigned        extra_res;
-       /*
-        * true if entire key was inserted - can only be false for
-        * extents
-        */
-       bool            done;
 };
 
 struct btree_trans {
@@ -339,10 +326,38 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
        return b->set + b->nsets - 1;
 }
 
+static inline void *
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+{
+       return (void *) ((u64 *) b->data + 1 + offset);
+}
+
+static inline u16
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
+{
+       u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+
+       EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+       return ret;
+}
+
 static inline struct bset *bset(const struct btree *b,
                                const struct bset_tree *t)
 {
-       return (void *) b->data + t->data_offset * sizeof(u64);
+       return __btree_node_offset_to_ptr(b, t->data_offset);
+}
+
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
+{
+       t->end_offset =
+               __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
+}
+
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
+                                 const struct bset *i)
+{
+       t->data_offset = __btree_node_ptr_to_offset(b, i);
+       set_btree_bset_end(b, t);
 }
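
To make the new offset helpers concrete: offsets are u16 counts of u64 words past the first word of b->data, so they can address nodes up to 512KiB. A standalone sketch of the round trip (toy types only, not the real struct btree):

#include <assert.h>
#include <stdint.h>

struct toy_node { uint64_t data[8192]; };	/* one 64KiB node */

static void *toy_offset_to_ptr(struct toy_node *b, uint16_t offset)
{
	/* u64 granularity; word 0 of b->data is never addressed */
	return b->data + 1 + offset;
}

static uint16_t toy_ptr_to_offset(struct toy_node *b, const void *p)
{
	uint16_t ret = (const uint64_t *) p - 1 - b->data;

	assert(toy_offset_to_ptr(b, ret) == p);	/* round trip is exact */
	return ret;
}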
 
 static inline struct bset *btree_bset_first(struct btree *b)
@@ -358,19 +373,27 @@ static inline struct bset *btree_bset_last(struct btree *b)
 static inline u16
 __btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
 {
-       size_t ret = (u64 *) k - (u64 *) b->data - 1;
-
-       EBUG_ON(ret > U16_MAX);
-       return ret;
+       return __btree_node_ptr_to_offset(b, k);
 }
 
 static inline struct bkey_packed *
 __btree_node_offset_to_key(const struct btree *b, u16 k)
 {
-       return (void *) ((u64 *) b->data + k + 1);
+       return __btree_node_offset_to_ptr(b, k);
 }
 
-#define btree_bkey_first(_b, _t)       (bset(_b, _t)->start)
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
+{
+       return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
+}
+
+#define btree_bkey_first(_b, _t)                                       \
+({                                                                     \
+       EBUG_ON(bset(_b, _t)->start !=                                  \
+               __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
+                                                                       \
+       bset(_b, _t)->start;                                            \
+})
 
 #define btree_bkey_last(_b, _t)                                                \
 ({                                                                     \
@@ -380,23 +403,6 @@ __btree_node_offset_to_key(const struct btree *b, u16 k)
        __btree_node_offset_to_key(_b, (_t)->end_offset);               \
 })
 
-static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
-{
-       t->end_offset =
-               __btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
-       btree_bkey_last(b, t);
-}
-
-static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
-                                 const struct bset *i)
-{
-       t->data_offset = (u64 *) i - (u64 *) b->data;
-
-       EBUG_ON(bset(b, t) != i);
-
-       set_btree_bset_end(b, t);
-}
-
 static inline unsigned bset_byte_offset(struct btree *b, void *i)
 {
        return i - (void *) b->data;
@@ -439,28 +445,17 @@ struct btree_root {
  * we're holding the write lock and we know what key is about to be overwritten:
  */
 
-struct btree_iter;
-struct btree_node_iter;
-
 enum btree_insert_ret {
        BTREE_INSERT_OK,
        /* extent spanned multiple leaf nodes: have to traverse to next node: */
        BTREE_INSERT_NEED_TRAVERSE,
        /* write lock held for too long */
-       BTREE_INSERT_NEED_RESCHED,
        /* leaf node needs to be split */
        BTREE_INSERT_BTREE_NODE_FULL,
-       BTREE_INSERT_JOURNAL_RES_FULL,
        BTREE_INSERT_ENOSPC,
        BTREE_INSERT_NEED_GC_LOCK,
 };
 
-struct extent_insert_hook {
-       enum btree_insert_ret
-       (*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
-             struct bkey_s_c, const struct bkey_i *);
-};
-
 enum btree_gc_coalesce_fail_reason {
        BTREE_GC_COALESCE_FAIL_RESERVE_GET,
        BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
index 5e47d4cd7c48727131437ff035a684a8e8a09b52..882e1c277c513ae44057f1994bac4d2f409f93f8 100644 (file)
@@ -22,7 +22,6 @@ struct btree_insert {
        struct disk_reservation *disk_res;
        struct journal_res      journal_res;
        u64                     *journal_seq;
-       struct extent_insert_hook *hook;
        unsigned                flags;
        bool                    did_work;
 
@@ -32,22 +31,10 @@ struct btree_insert {
 
 int __bch2_btree_insert_at(struct btree_insert *);
 
-#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
-#define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
-
 #define BTREE_INSERT_ENTRY(_iter, _k)                                  \
        ((struct btree_insert_entry) {                                  \
                .iter           = (_iter),                              \
                .k              = (_k),                                 \
-               .done           = false,                                \
-       })
-
-#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra)                        \
-       ((struct btree_insert_entry) {                                  \
-               .iter           = (_iter),                              \
-               .k              = (_k),                                 \
-               .extra_res = (_extra),                                  \
-               .done           = false,                                \
        })
 
 /**
@@ -63,13 +50,11 @@ int __bch2_btree_insert_at(struct btree_insert *);
  * -EROFS: filesystem read only
  * -EIO: journal or btree node IO error
  */
-#define bch2_btree_insert_at(_c, _disk_res, _hook,                     \
-                           _journal_seq, _flags, ...)                  \
+#define bch2_btree_insert_at(_c, _disk_res, _journal_seq, _flags, ...) \
        __bch2_btree_insert_at(&(struct btree_insert) {                 \
                .c              = (_c),                                 \
                .disk_res       = (_disk_res),                          \
                .journal_seq    = (_journal_seq),                       \
-               .hook           = (_hook),                              \
                .flags          = (_flags),                             \
                .nr             = COUNT_ARGS(__VA_ARGS__),              \
                .entries        = (struct btree_insert_entry[]) {       \
@@ -123,17 +108,13 @@ enum {
 int bch2_btree_delete_at(struct btree_iter *, unsigned);
 
 int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
-                            struct disk_reservation *,
-                            struct extent_insert_hook *, u64 *, unsigned);
+                            struct disk_reservation *, u64 *, unsigned);
 
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
-                    struct disk_reservation *,
-                    struct extent_insert_hook *, u64 *, int flags);
+                    struct disk_reservation *, u64 *, int flags);
 
 int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
-                          struct bpos, struct bpos, struct bversion,
-                          struct disk_reservation *,
-                          struct extent_insert_hook *, u64 *);
+                           struct bpos, struct bpos, u64 *);
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
                            __le64, unsigned);
@@ -142,11 +123,17 @@ int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
 
 /* new transactional interface: */
 
-void bch2_trans_update(struct btree_trans *, struct btree_iter *,
-                            struct bkey_i *, unsigned);
+static inline void
+bch2_trans_update(struct btree_trans *trans,
+                 struct btree_insert_entry entry)
+{
+       BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
+
+       trans->updates[trans->nr_updates++] = entry;
+}
+
 int bch2_trans_commit(struct btree_trans *,
                      struct disk_reservation *,
-                     struct extent_insert_hook *,
                      u64 *, unsigned);
 
 #define bch2_trans_do(_c, _journal_seq, _flags, _do)                   \
@@ -159,7 +146,7 @@ int bch2_trans_commit(struct btree_trans *,
        do {                                                            \
                bch2_trans_begin(&trans);                               \
                                                                        \
-               _ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL,   \
+               _ret = (_do) ?: bch2_trans_commit(&trans, NULL,         \
                                        (_journal_seq), (_flags));      \
        } while (_ret == -EINTR);                                       \
                                                                        \
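
Put together, a caller of the reworked interface now reads roughly like this sketch (hedged: c, iter, new_key and journal_seq are assumed to exist, and iterator acquisition via __bch2_trans_get_iter() and its wrappers is elided):

struct btree_trans trans;
int ret;

bch2_trans_init(&trans, c);
do {
	bch2_trans_begin(&trans);

	/* queue one update; entries no longer carry done/extra_res */
	bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_key));

	/* note: no extent_insert_hook argument anymore */
	ret = bch2_trans_commit(&trans, NULL, &journal_seq,
				BTREE_INSERT_ATOMIC);
} while (ret == -EINTR);
bch2_trans_exit(&trans);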
index 392ee0a0659294720f1b75f276fc1e615abee787..a6832ef7ec7155e4522c070395863f83afe8da49 100644 (file)
@@ -34,7 +34,7 @@ static void btree_node_interior_verify(struct btree *b)
 
        BUG_ON(!b->level);
 
-       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false);
+       bch2_btree_node_iter_init(&iter, b, b->key.k.p, false);
 #if 1
        BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) ||
               bkey_cmp_left_packed(b, k, &b->key.k.p));
@@ -183,7 +183,8 @@ found:
         */
        replicas = bch2_extent_nr_dirty_ptrs(k);
        if (replicas)
-               stats->s[replicas - 1].data[S_META] -= c->opts.btree_node_size;
+               stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
+                       c->opts.btree_node_size * replicas;
 
        /*
         * We're dropping @k from the btree, but it's still live until the
@@ -210,7 +211,7 @@ found:
                struct bch_fs_usage tmp = { 0 };
 
                bch2_mark_key(c, bkey_i_to_s_c(&d->key),
-                            -c->opts.btree_node_size, true, b
+                            -c->opts.btree_node_size, BCH_DATA_BTREE, b
                             ? gc_pos_btree_node(b)
                             : gc_pos_btree_root(as->btree_id),
                             &tmp, 0, 0);
@@ -289,7 +290,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
        BUG_ON(!pending->index_update_done);
 
        bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                    -c->opts.btree_node_size, true,
+                    -c->opts.btree_node_size, BCH_DATA_BTREE,
                     gc_phase(GC_PHASE_PENDING_DELETE),
                     &stats, 0, 0);
        /*
@@ -578,6 +579,8 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
 
+       bch2_journal_pin_flush(&c->journal, &as->journal);
+
        BUG_ON(as->nr_new_nodes);
        BUG_ON(as->nr_pending);
 
@@ -1095,7 +1098,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
        __bch2_btree_set_root_inmem(c, b);
 
        bch2_mark_key(c, bkey_i_to_s_c(&b->key),
-                     c->opts.btree_node_size, true,
+                     c->opts.btree_node_size, BCH_DATA_BTREE,
                      gc_pos_btree_root(b->btree_id),
                      &stats, 0, 0);
 
@@ -1142,7 +1145,8 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
        struct btree *old;
 
        trace_btree_set_root(c, b);
-       BUG_ON(!b->written);
+       BUG_ON(!b->written &&
+              !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
 
        old = btree_node_root(c, b);
 
@@ -1182,7 +1186,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 
        if (bkey_extent_is_data(&insert->k))
                bch2_mark_key(c, bkey_i_to_s_c(insert),
-                            c->opts.btree_node_size, true,
+                            c->opts.btree_node_size, BCH_DATA_BTREE,
                             gc_pos_btree_node(b), &stats, 0, 0);
 
        while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
@@ -1317,7 +1321,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 
        BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
 
-       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false, false);
+       bch2_btree_node_iter_init(&node_iter, b, k->k.p, false);
 
        while (!bch2_keylist_empty(keys)) {
                k = bch2_keylist_front(keys);
@@ -1963,7 +1967,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                bch2_btree_node_lock_write(b, iter);
 
                bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
-                             c->opts.btree_node_size, true,
+                             c->opts.btree_node_size, BCH_DATA_BTREE,
                              gc_pos_btree_root(b->btree_id),
                              &stats, 0, 0);
                bch2_btree_node_free_index(as, NULL,
@@ -2150,7 +2154,7 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
                                 as->mode,
                                 as->nodes_written,
                                 atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
-                                bch2_journal_pin_seq(&c->journal, &as->journal));
+                                as->journal.seq);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return out - buf;
index e6f050718586b1dcd9a143863c3d725c86f60cad..fa30809d50f4364ae174b44bb053ddef0abafe0c 100644 (file)
@@ -160,15 +160,6 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
 {
        struct btree *b;
 
-       /*
-        * iterators are inconsistent when they hit end of leaf, until
-        * traversed again
-        *
-        * XXX inconsistent how?
-        */
-       if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
-               return;
-
        if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
                return;
 
@@ -240,14 +231,19 @@ static inline void *write_block(struct btree *b)
        return (void *) b->data + (b->written << 9);
 }
 
+static inline bool __btree_addr_written(struct btree *b, void *p)
+{
+       return p < write_block(b);
+}
+
 static inline bool bset_written(struct btree *b, struct bset *i)
 {
-       return (void *) i < write_block(b);
+       return __btree_addr_written(b, i);
 }
 
-static inline bool bset_unwritten(struct btree *b, struct bset *i)
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
 {
-       return (void *) i > write_block(b);
+       return __btree_addr_written(b, k);
 }
 
 static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
@@ -306,10 +302,9 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
        return NULL;
 }
 
-static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
-                                     struct bkey_packed *k)
+static inline void unreserve_whiteout(struct btree *b, struct bkey_packed *k)
 {
-       if (bset_written(b, bset(b, t))) {
+       if (bkey_written(b, k)) {
                EBUG_ON(b->uncompacted_whiteout_u64s <
                        bkeyp_key_u64s(&b->format, k));
                b->uncompacted_whiteout_u64s -=
@@ -317,10 +312,9 @@ static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
        }
 }
 
-static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
-                                   struct bkey_packed *k)
+static inline void reserve_whiteout(struct btree *b, struct bkey_packed *k)
 {
-       if (bset_written(b, bset(b, t))) {
+       if (bkey_written(b, k)) {
                BUG_ON(!k->needs_whiteout);
                b->uncompacted_whiteout_u64s +=
                        bkeyp_key_u64s(&b->format, k);
@@ -332,40 +326,14 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
  * insert into could be written out from under us)
  */
 static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
-                                             struct btree *b, unsigned u64s)
+                                              struct btree *b, unsigned u64s)
 {
        if (unlikely(btree_node_fake(b)))
                return false;
 
-       if (btree_node_is_extents(b)) {
-               /* The insert key might split an existing key
-                * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
-                */
-               u64s += BKEY_EXTENT_U64s_MAX;
-       }
-
        return u64s <= bch_btree_keys_u64s_remaining(c, b);
 }
 
-static inline bool journal_res_insert_fits(struct btree_insert *trans,
-                                          struct btree_insert_entry *insert)
-{
-       unsigned u64s = 0;
-       struct btree_insert_entry *i;
-
-       /*
-        * If we didn't get a journal reservation, we're in journal replay and
-        * we're not journalling updates:
-        */
-       if (!trans->journal_res.ref)
-               return true;
-
-       for (i = insert; i < trans->entries + trans->nr; i++)
-               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
-
-       return u64s <= trans->journal_res.u64s;
-}
-
 ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
 
 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
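
The bset_written()/bkey_written() pair above is pure address arithmetic against the write watermark; a minimal standalone illustration (toy parameters, not the real struct btree):

#include <stdbool.h>

/*
 * written counts 512-byte sectors of the node already on disk; any
 * address below that watermark is immutable, which is why deletions
 * there must reserve whiteouts instead of modifying keys in place.
 */
static bool toy_addr_written(const void *node_data, unsigned written,
			     const void *p)
{
	return (const char *) p < (const char *) node_data + (written << 9);
}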
index a481b0d632d9c3af249171fbd7231b7c0bfe6a72..33c913f746604ed144bc3d3840373c86f264ef19 100644 (file)
@@ -24,7 +24,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 {
        const struct bkey_format *f = &b->format;
        struct bkey_packed *k;
-       struct bset_tree *t;
        unsigned clobber_u64s;
 
        EBUG_ON(btree_node_just_written(b));
@@ -37,9 +36,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        if (k && !bkey_cmp_packed(b, k, &insert->k)) {
                BUG_ON(bkey_whiteout(k));
 
-               t = bch2_bkey_to_bset(b, k);
-
-               if (bset_unwritten(b, bset(b, t)) &&
+               if (!bkey_written(b, k) &&
                    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
                    !bkey_whiteout(&insert->k)) {
                        k->type = insert->k.type;
@@ -50,9 +47,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
 
                insert->k.needs_whiteout = k->needs_whiteout;
 
-               btree_keys_account_key_drop(&b->nr, t - b->set, k);
+               btree_account_key_drop(b, k);
 
-               if (t == bset_tree_last(b)) {
+               if (k >= btree_bset_last(b)->start) {
                        clobber_u64s = k->u64s;
 
                        /*
@@ -62,8 +59,9 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                         */
                        if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
                                bch2_bset_delete(b, k, clobber_u64s);
-                               bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                                       k, clobber_u64s, 0);
+                               bch2_btree_node_iter_fix(iter, b, node_iter,
+                                                        k, clobber_u64s, 0);
+                               bch2_btree_iter_verify(iter, b);
                                return true;
                        }
 
@@ -71,11 +69,12 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                }
 
                k->type = KEY_TYPE_DELETED;
-               bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
-                                       k->u64s, k->u64s);
+               bch2_btree_node_iter_fix(iter, b, node_iter, k,
+                                        k->u64s, k->u64s);
+               bch2_btree_iter_verify(iter, b);
 
                if (bkey_whiteout(&insert->k)) {
-                       reserve_whiteout(b, t, k);
+                       reserve_whiteout(b, k);
                        return true;
                } else {
                        k->needs_whiteout = false;
@@ -90,14 +89,14 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                insert->k.needs_whiteout = false;
        }
 
-       t = bset_tree_last(b);
-       k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+       k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
        clobber_u64s = 0;
 overwrite:
        bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
        if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
-               bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
-                                       clobber_u64s, k->u64s);
+               bch2_btree_node_iter_fix(iter, b, node_iter, k,
+                                        clobber_u64s, k->u64s);
+       bch2_btree_iter_verify(iter, b);
        return true;
 }
 
@@ -110,8 +109,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
 
        btree_node_lock_type(c, b, SIX_LOCK_read);
        bch2_btree_node_write_cond(c, b,
-                       (btree_current_write(b) == w &&
-                        w->journal.pin_list == journal_seq_pin(j, seq)));
+               (btree_current_write(b) == w && w->journal.seq == seq));
        six_unlock_read(&b->lock);
 }
 
@@ -297,6 +295,30 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
 
 /* Normal update interface: */
 
+static enum btree_insert_ret
+btree_key_can_insert(struct btree_insert *trans,
+                     struct btree_insert_entry *insert,
+                     unsigned *u64s)
+{
+       struct bch_fs *c = trans->c;
+       struct btree *b = insert->iter->l[0].b;
+       enum btree_insert_ret ret;
+
+       if (unlikely(btree_node_fake(b)))
+               return BTREE_INSERT_BTREE_NODE_FULL;
+
+       ret = !btree_node_is_extents(b)
+               ? BTREE_INSERT_OK
+               : bch2_extent_can_insert(trans, insert, u64s);
+       if (ret)
+               return ret;
+
+       if (*u64s > bch_btree_keys_u64s_remaining(c, b))
+               return BTREE_INSERT_BTREE_NODE_FULL;
+
+       return BTREE_INSERT_OK;
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
@@ -309,14 +331,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        unsigned u64s;
        int ret;
 
-       trans_for_each_entry(trans, i) {
-               BUG_ON(i->done);
+       trans_for_each_entry(trans, i)
                BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
-       }
 
        u64s = 0;
        trans_for_each_entry(trans, i)
-               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+               u64s += jset_u64s(i->k->k.u64s);
 
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
@@ -336,24 +356,34 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
                goto out;
        }
 
+       /*
+        * Check if the insert will fit in the leaf node with the write lock
+        * held, otherwise another thread could write the node changing the
+        * amount of space available:
+        */
        u64s = 0;
        trans_for_each_entry(trans, i) {
                /* Multiple inserts might go to same leaf: */
                if (!same_leaf_as_prev(trans, i))
                        u64s = 0;
 
-               /*
-                * bch2_btree_node_insert_fits() must be called under write lock:
-                * with only an intent lock, another thread can still call
-                * bch2_btree_node_write(), converting an unwritten bset to a
-                * written one
-                */
-               u64s += i->k->k.u64s + i->extra_res;
-               if (!bch2_btree_node_insert_fits(c,
-                               i->iter->l[0].b, u64s)) {
+               u64s += i->k->k.u64s;
+               switch (btree_key_can_insert(trans, i, &u64s)) {
+               case BTREE_INSERT_OK:
+                       break;
+               case BTREE_INSERT_BTREE_NODE_FULL:
                        ret = -EINTR;
                        *split = i->iter;
                        goto out;
+               case BTREE_INSERT_ENOSPC:
+                       ret = -ENOSPC;
+                       goto out;
+               case BTREE_INSERT_NEED_GC_LOCK:
+                       ret = -EINTR;
+                       *cycle_gc_lock = true;
+                       goto out;
+               default:
+                       BUG();
                }
        }
 
@@ -369,34 +399,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        trans_for_each_entry(trans, i) {
                switch (btree_insert_key_leaf(trans, i)) {
                case BTREE_INSERT_OK:
-                       i->done = true;
                        break;
-               case BTREE_INSERT_JOURNAL_RES_FULL:
                case BTREE_INSERT_NEED_TRAVERSE:
-               case BTREE_INSERT_NEED_RESCHED:
-                       ret = -EINTR;
-                       break;
-               case BTREE_INSERT_BTREE_NODE_FULL:
-                       ret = -EINTR;
-                       *split = i->iter;
-                       break;
-               case BTREE_INSERT_ENOSPC:
-                       ret = -ENOSPC;
-                       break;
-               case BTREE_INSERT_NEED_GC_LOCK:
+                       BUG_ON((trans->flags & BTREE_INSERT_ATOMIC));
                        ret = -EINTR;
-                       *cycle_gc_lock = true;
-                       break;
+                       goto out;
                default:
                        BUG();
                }
-
-               /*
-                * If we did some work (i.e. inserted part of an extent),
-                * we have to do all the other updates as well:
-                */
-               if (!trans->did_work && (ret || *split))
-                       break;
        }
 out:
        multi_unlock_write(trans);
@@ -490,13 +500,8 @@ out:
                        bch2_btree_iter_verify_locks(linked);
                        BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
                               trans->did_work &&
-                              linked->uptodate >= BTREE_ITER_NEED_RELOCK);
+                              !btree_node_locked(linked, 0));
                }
-
-               /* make sure we didn't lose an error: */
-               if (!ret)
-                       trans_for_each_entry(trans, i)
-                               BUG_ON(!i->done);
        }
 
        BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
@@ -581,29 +586,8 @@ err:
        goto out;
 }
 
-void bch2_trans_update(struct btree_trans *trans,
-                      struct btree_iter *iter,
-                      struct bkey_i *k,
-                      unsigned extra_journal_res)
-{
-       struct btree_insert_entry *i;
-
-       BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
-
-       i = &trans->updates[trans->nr_updates++];
-
-       *i = (struct btree_insert_entry) {
-               .iter   = iter,
-               .k              = k,
-               .extra_res      = extra_journal_res,
-       };
-
-       btree_insert_entry_checks(trans->c, i);
-}
-
 int bch2_trans_commit(struct btree_trans *trans,
                      struct disk_reservation *disk_res,
-                     struct extent_insert_hook *hook,
                      u64 *journal_seq,
                      unsigned flags)
 {
@@ -631,7 +615,7 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
        bkey_init(&k.k);
        k.k.p = iter->pos;
 
-       return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+       return bch2_btree_insert_at(iter->c, NULL, NULL,
                                    BTREE_INSERT_NOFAIL|
                                    BTREE_INSERT_USE_RESERVE|flags,
                                    BTREE_INSERT_ENTRY(iter, &k));
@@ -640,7 +624,6 @@ int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
 int bch2_btree_insert_list_at(struct btree_iter *iter,
                             struct keylist *keys,
                             struct disk_reservation *disk_res,
-                            struct extent_insert_hook *hook,
                             u64 *journal_seq, unsigned flags)
 {
        BUG_ON(flags & BTREE_INSERT_ATOMIC);
@@ -648,7 +631,7 @@ int bch2_btree_insert_list_at(struct btree_iter *iter,
        bch2_verify_keylist_sorted(keys);
 
        while (!bch2_keylist_empty(keys)) {
-               int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
+               int ret = bch2_btree_insert_at(iter->c, disk_res,
                                journal_seq, flags,
                                BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
                if (ret)
@@ -670,7 +653,6 @@ int bch2_btree_insert_list_at(struct btree_iter *iter,
 int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
                     struct bkey_i *k,
                     struct disk_reservation *disk_res,
-                    struct extent_insert_hook *hook,
                     u64 *journal_seq, int flags)
 {
        struct btree_iter iter;
@@ -678,7 +660,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
 
        bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
                             BTREE_ITER_INTENT);
-       ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
+       ret = bch2_btree_insert_at(c, disk_res, journal_seq, flags,
                                   BTREE_INSERT_ENTRY(&iter, k));
        bch2_btree_iter_unlock(&iter);
 
@@ -691,12 +673,8 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
  * Range is a half open interval - [start, end)
  */
 int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
-                          struct bpos start,
-                          struct bpos end,
-                          struct bversion version,
-                          struct disk_reservation *disk_res,
-                          struct extent_insert_hook *hook,
-                          u64 *journal_seq)
+                           struct bpos start, struct bpos end,
+                           u64 *journal_seq)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -706,14 +684,12 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                             BTREE_ITER_INTENT);
 
        while ((k = bch2_btree_iter_peek(&iter)).k &&
-              !(ret = btree_iter_err(k))) {
+              !(ret = btree_iter_err(k)) &&
+              bkey_cmp(iter.pos, end) < 0) {
                unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
                /* really shouldn't be using a bare, unpadded bkey_i */
                struct bkey_i delete;
 
-               if (bkey_cmp(iter.pos, end) >= 0)
-                       break;
-
                bkey_init(&delete.k);
 
                /*
@@ -727,7 +703,6 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                 * bkey_start_pos(k.k)).
                 */
                delete.k.p = iter.pos;
-               delete.k.version = version;
 
                if (iter.flags & BTREE_ITER_IS_EXTENTS) {
                        /* create the biggest key we can */
@@ -735,7 +710,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                        bch2_cut_back(end, &delete.k);
                }
 
-               ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
+               ret = bch2_btree_insert_at(c, NULL, journal_seq,
                                           BTREE_INSERT_NOFAIL,
                                           BTREE_INSERT_ENTRY(&iter, &delete));
                if (ret)
index 43112445040501828e49274b2d1fde18b09f98ad..801f6c3735028670aeda833df3d8d615a421b8d0 100644 (file)
@@ -72,6 +72,8 @@
 #include <linux/preempt.h>
 #include <trace/events/bcachefs.h>
 
+static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+
 #ifdef DEBUG_BUCKETS
 
 #define lg_local_lock  lg_global_lock
@@ -81,22 +83,26 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
 {
        struct bch_fs_usage stats =
                __bch2_fs_usage_read(c);
-       unsigned i;
-
-       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
-               if ((s64) stats.s[i].data[S_META] < 0)
-                       panic("replicas %u meta underflow: %lli\n",
-                             i + 1, stats.s[i].data[S_META]);
+       unsigned i, j;
 
-               if ((s64) stats.s[i].data[S_DIRTY] < 0)
-                       panic("replicas %u dirty underflow: %lli\n",
-                             i + 1, stats.s[i].data[S_DIRTY]);
+       for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
+               for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++)
+                       if ((s64) stats.replicas[i].data[j] < 0)
+                               panic("replicas %u %s sectors underflow: %lli\n",
+                                     i + 1, bch2_data_types[j],
+                                     stats.replicas[i].data[j]);
 
-               if ((s64) stats.s[i].persistent_reserved < 0)
+               if ((s64) stats.replicas[i].persistent_reserved < 0)
                        panic("replicas %u reserved underflow: %lli\n",
-                             i + 1, stats.s[i].persistent_reserved);
+                             i + 1, stats.replicas[i].persistent_reserved);
        }
 
+       for (j = 0; j < ARRAY_SIZE(stats.buckets); j++)
+               if ((s64) stats.buckets[j] < 0)
+                       panic("%s buckets underflow: %lli\n",
+                             bch2_data_types[j],
+                             stats.buckets[j]);
+
        if ((s64) stats.online_reserved < 0)
                panic("sectors_online_reserved underflow: %lli\n",
                      stats.online_reserved);
@@ -146,6 +152,7 @@ static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
  */
 void bch2_bucket_seq_cleanup(struct bch_fs *c)
 {
+       u64 journal_seq = atomic64_read(&c->journal.seq);
        u16 last_seq_ondisk = c->journal.last_seq_ondisk;
        struct bch_dev *ca;
        struct bucket_array *buckets;
@@ -153,6 +160,12 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
        struct bucket_mark m;
        unsigned i;
 
+       if (journal_seq - c->last_bucket_seq_cleanup <
+           (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
+               return;
+
+       c->last_bucket_seq_cleanup = journal_seq;
+
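(With BUCKET_JOURNAL_SEQ_BITS = 16, defined in buckets_types.h below, the guard above rate-limits the scan to once per 1U << 14 = 16384 journal sequence numbers, a quarter of the 16-bit space the per-bucket sequence numbers live in, so they cannot wrap between two cleanups.)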
        for_each_member_device(ca, c, i) {
                down_read(&ca->bucket_lock);
                buckets = bucket_array(ca);
@@ -232,7 +245,9 @@ bch2_fs_usage_read(struct bch_fs *c)
 }
 
 struct fs_usage_sum {
+       u64     hidden;
        u64     data;
+       u64     cached;
        u64     reserved;
 };
 
@@ -241,10 +256,19 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
        struct fs_usage_sum sum = { 0 };
        unsigned i;
 
-       for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
-               sum.data += (stats.s[i].data[S_META] +
-                            stats.s[i].data[S_DIRTY]) * (i + 1);
-               sum.reserved += stats.s[i].persistent_reserved * (i + 1);
+       /*
+        * For superblock and journal we count bucket usage, not sector usage,
+        * because any internal fragmentation should _not_ be counted as
+        * free space:
+        */
+       sum.hidden += stats.buckets[BCH_DATA_SB];
+       sum.hidden += stats.buckets[BCH_DATA_JOURNAL];
+
+       for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
+               sum.data        += stats.replicas[i].data[BCH_DATA_BTREE];
+               sum.data        += stats.replicas[i].data[BCH_DATA_USER];
+               sum.cached      += stats.replicas[i].data[BCH_DATA_CACHED];
+               sum.reserved    += stats.replicas[i].persistent_reserved;
        }
 
        sum.reserved += stats.online_reserved;
@@ -260,14 +284,14 @@ static u64 reserve_factor(u64 r)
 
 static u64 avail_factor(u64 r)
 {
-       return (r << RESERVE_FACTOR) / (1 << RESERVE_FACTOR) + 1;
+       return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
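
(Worked example of the avail_factor() fix: assuming RESERVE_FACTOR is 6, a value not shown in this hunk, the old expression parsed as ((r << 6) / (1 << 6)) + 1 = r + 1, handing out slightly more than the free space; the corrected form returns r * 64 / 65, holding back roughly 1/65th of it.)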
 
-u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
+static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
 {
        struct fs_usage_sum sum = __fs_usage_sum(stats);
 
-       return sum.data + reserve_factor(sum.reserved);
+       return sum.hidden + sum.data + reserve_factor(sum.reserved);
 }
 
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
@@ -275,9 +299,9 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
        return min(c->capacity, __bch2_fs_sectors_used(c, stats));
 }
 
-u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
+static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
 {
-       return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
+       return c->capacity - bch2_fs_sectors_used(c, stats);
 }
 
 static inline int is_unavailable_bucket(struct bucket_mark m)
@@ -313,9 +337,9 @@ static bool bucket_became_unavailable(struct bch_fs *c,
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
-                       struct bch_fs_usage *stats,
-                       struct disk_reservation *disk_res,
-                       struct gc_pos gc_pos)
+                        struct bch_fs_usage *stats,
+                        struct disk_reservation *disk_res,
+                        struct gc_pos gc_pos)
 {
        struct fs_usage_sum sum = __fs_usage_sum(*stats);
        s64 added = sum.data + sum.reserved;
@@ -347,21 +371,21 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+                                 struct bch_fs_usage *stats,
                                  struct bucket_mark old, struct bucket_mark new)
 {
        struct bch_dev_usage *dev_usage;
 
-       if (c)
-               percpu_rwsem_assert_held(&c->usage_lock);
+       percpu_rwsem_assert_held(&c->usage_lock);
 
-       if (old.data_type && new.data_type &&
-           old.data_type != new.data_type) {
-               BUG_ON(!c);
-               bch2_fs_inconsistent(c,
-                       "different types of data in same bucket: %s, %s",
-                       bch2_data_types[old.data_type],
-                       bch2_data_types[new.data_type]);
-       }
+       bch2_fs_inconsistent_on(old.data_type && new.data_type &&
+                               old.data_type != new.data_type, c,
+               "different types of data in same bucket: %s, %s",
+               bch2_data_types[old.data_type],
+               bch2_data_types[new.data_type]);
+
+       stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+       stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
 
        dev_usage = this_cpu_ptr(ca->usage_percpu);
 
@@ -386,17 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
        bch2_dev_stats_verify(ca);
 }
 
-#define bucket_data_cmpxchg(c, ca, g, new, expr)               \
+#define bucket_data_cmpxchg(c, ca, stats, g, new, expr)                \
 ({                                                             \
        struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
                                                                \
-       bch2_dev_usage_update(c, ca, _old, new);                \
+       bch2_dev_usage_update(c, ca, stats, _old, new);         \
        _old;                                                   \
 })
 
-bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, struct bucket_mark *old)
 {
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
        struct bucket *g;
        struct bucket_mark new;
 
@@ -404,11 +429,8 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        g = bucket(ca, b);
 
-       *old = bucket_data_cmpxchg(c, ca, g, new, ({
-               if (!is_available_bucket(new)) {
-                       percpu_up_read_preempt_enable(&c->usage_lock);
-                       return false;
-               }
+       *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
+               BUG_ON(!is_available_bucket(new));
 
                new.owned_by_allocator  = 1;
                new.data_type           = 0;
@@ -417,16 +439,22 @@ bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.gen++;
        }));
 
+       /*
+        * This isn't actually correct yet, since fs usage is still
+        * uncompressed sectors:
+        */
+       stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+
        if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, bucket_to_sector(ca, b),
                                 old->cached_sectors);
-       return true;
 }
 
 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, bool owned_by_allocator,
                            struct gc_pos pos, unsigned flags)
 {
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
        struct bucket *g;
        struct bucket_mark old, new;
 
@@ -437,7 +465,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
            gc_will_visit(c, pos))
                return;
 
-       old = bucket_data_cmpxchg(c, ca, g, new, ({
+       old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
@@ -445,17 +473,11 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
               c->gc_pos.phase == GC_PHASE_DONE);
 }
 
-#define saturated_add(ca, dst, src, max)                       \
+#define checked_add(a, b)                                      \
 do {                                                           \
-       BUG_ON((int) (dst) + (src) < 0);                        \
-       if ((dst) == (max))                                     \
-               ;                                               \
-       else if ((dst) + (src) <= (max))                        \
-               dst += (src);                                   \
-       else {                                                  \
-               dst = (max);                                    \
-               trace_sectors_saturated(ca);            \
-       }                                                       \
+       unsigned _res = (unsigned) (a) + (b);                   \
+       (a) = _res;                                             \
+       BUG_ON((a) != _res);                                    \
 } while (0)
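
checked_add() trades the old saturation behaviour for a hard trap on overflow; a standalone sketch of the mechanism (assert() standing in for BUG_ON(), u16 mirroring the bucket sector counters):

#include <assert.h>
#include <stdint.h>

static void toy_checked_add(uint16_t *a, unsigned b)
{
	unsigned res = (unsigned) *a + b;

	*a = res;		/* may silently truncate to 16 bits... */
	assert(*a == res);	/* ...which now traps instead of wrapping */
}

/*
 * toy_checked_add(&dirty, 1000) with dirty == 65000 would store 464
 * and fire the assert, where saturated_add() used to clamp at the
 * GC_MAX_SECTORS_USED limit and carry on.
 */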
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -463,10 +485,12 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
 {
+       struct bch_fs_usage *stats;
        struct bucket *g;
        struct bucket_mark old, new;
 
-       BUG_ON(!type);
+       BUG_ON(type != BCH_DATA_SB &&
+              type != BCH_DATA_JOURNAL);
 
        if (likely(c)) {
                percpu_rwsem_assert_held(&c->usage_lock);
@@ -474,25 +498,32 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
                    gc_will_visit(c, pos))
                        return;
-       }
 
-       rcu_read_lock();
+               stats = this_cpu_ptr(c->usage_percpu);
 
-       g = bucket(ca, b);
-       old = bucket_data_cmpxchg(c, ca, g, new, ({
-               saturated_add(ca, new.dirty_sectors, sectors,
-                             GC_MAX_SECTORS_USED);
-               new.data_type           = type;
-       }));
+               g = bucket(ca, b);
+               old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
+                       new.data_type = type;
+                       checked_add(new.dirty_sectors, sectors);
+               }));
+
+               stats->replicas[0].data[type] += sectors;
+       } else {
+               rcu_read_lock();
 
-       rcu_read_unlock();
+               g = bucket(ca, b);
+               old = bucket_cmpxchg(g, new, ({
+                       new.data_type = type;
+                       checked_add(new.dirty_sectors, sectors);
+               }));
+
+               rcu_read_unlock();
+       }
 
        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
 }
 
-/* Reverting this until the copygc + compression issue is fixed: */
-
 static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
 {
        if (!sectors)
@@ -511,16 +542,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              struct bkey_s_c_extent e,
                              const struct bch_extent_ptr *ptr,
                              struct bch_extent_crc_unpacked crc,
-                             s64 sectors, enum s_alloc type,
-                             struct bch_fs_usage *stats,
+                             s64 sectors, enum bch_data_type data_type,
+                             unsigned replicas,
+                             struct bch_fs_usage *fs_usage,
                              u64 journal_seq, unsigned flags)
 {
        struct bucket_mark old, new;
-       unsigned saturated;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bucket *g = PTR_BUCKET(ca, ptr);
-       enum bch_data_type data_type = type == S_META
-               ? BCH_DATA_BTREE : BCH_DATA_USER;
+       s64 uncompressed_sectors = sectors;
        u64 v;
 
        if (crc.compression_type) {
@@ -538,6 +568,20 @@ static void bch2_mark_pointer(struct bch_fs *c,
                          +__disk_sectors(crc, new_sectors);
        }
 
+       /*
+        * fs level usage (which determines free space) is in uncompressed
+        * sectors, until copygc + compression is sorted out:
+        *
+        * note also that we always update @fs_usage, even when we otherwise
+        * wouldn't do anything because gc is running - this is because the
+        * caller still needs to account w.r.t. its disk reservation. It is
+        * caller's responsibility to not apply @fs_usage if gc is in progress.
+        */
+       fs_usage->replicas
+               [!ptr->cached && replicas ? replicas - 1 : 0].data
+               [!ptr->cached ? data_type : BCH_DATA_CACHED] +=
+                       uncompressed_sectors;
+
        if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
                if (journal_seq)
                        bucket_cmpxchg(g, new, ({
@@ -551,7 +595,6 @@ static void bch2_mark_pointer(struct bch_fs *c,
        v = atomic64_read(&g->_mark.v);
        do {
                new.v.counter = old.v.counter = v;
-               saturated = 0;
 
                /*
                 * Check this after reading bucket mark to guard against
@@ -565,17 +608,10 @@ static void bch2_mark_pointer(struct bch_fs *c,
                        return;
                }
 
-               if (!ptr->cached &&
-                   new.dirty_sectors == GC_MAX_SECTORS_USED &&
-                   sectors < 0)
-                       saturated = -sectors;
-
-               if (ptr->cached)
-                       saturated_add(ca, new.cached_sectors, sectors,
-                                     GC_MAX_SECTORS_USED);
+               if (!ptr->cached)
+                       checked_add(new.dirty_sectors, sectors);
                else
-                       saturated_add(ca, new.dirty_sectors, sectors,
-                                     GC_MAX_SECTORS_USED);
+                       checked_add(new.cached_sectors, sectors);
 
                if (!new.dirty_sectors &&
                    !new.cached_sectors) {
@@ -597,28 +633,22 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, old, new);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new);
 
        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
-
-       if (saturated &&
-           atomic_long_add_return(saturated,
-                                  &ca->saturated_count) >=
-           bucket_to_sector(ca, ca->free_inc.size)) {
-               if (c->gc_thread) {
-                       trace_gc_sectors_saturated(c);
-                       wake_up_process(c->gc_thread);
-               }
-       }
 }
 
 void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
-                  s64 sectors, bool metadata,
+                  s64 sectors, enum bch_data_type data_type,
                   struct gc_pos pos,
                   struct bch_fs_usage *stats,
                   u64 journal_seq, unsigned flags)
 {
+       unsigned replicas = bch2_extent_nr_dirty_ptrs(k);
+
+       BUG_ON(replicas && replicas - 1 >= ARRAY_SIZE(stats->replicas));
+
        /*
         * synchronization w.r.t. GC:
         *
@@ -661,34 +691,20 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
-               enum s_alloc type = metadata ? S_META : S_DIRTY;
-               unsigned replicas = 0;
 
-               BUG_ON(metadata && bkey_extent_is_cached(e.k));
                BUG_ON(!sectors);
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
-                       bch2_mark_pointer(c, e, ptr, crc, sectors, type,
-                                         stats, journal_seq, flags);
-                       replicas += !ptr->cached;
-               }
-
-               if (replicas) {
-                       BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
-                       stats->s[replicas - 1].data[type] += sectors;
-               }
+               extent_for_each_ptr_crc(e, ptr, crc)
+                       bch2_mark_pointer(c, e, ptr, crc, sectors, data_type,
+                                         replicas, stats, journal_seq, flags);
                break;
        }
-       case BCH_RESERVATION: {
-               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-               if (r.v->nr_replicas) {
-                       BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
-                       stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
-               }
+       case BCH_RESERVATION:
+               if (replicas)
+                       stats->replicas[replicas - 1].persistent_reserved +=
+                               sectors * replicas;
                break;
        }
-       }
        percpu_up_read_preempt_enable(&c->usage_lock);
 }
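
(To unpack the replicas[]/data[] indexing in bch2_mark_pointer() above: a dirty pointer in an n-replica extent accounts its uncompressed sectors to replicas[n - 1].data[BCH_DATA_USER] or .data[BCH_DATA_BTREE], while a cached pointer always lands in replicas[0].data[BCH_DATA_CACHED], since cached copies don't add to replication.)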
 
@@ -701,7 +717,7 @@ static u64 __recalc_sectors_available(struct bch_fs *c)
        for_each_possible_cpu(cpu)
                per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
 
-       return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
+       return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
 }
 
 /* Used by gc when it's starting: */
@@ -833,9 +849,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
                             ca->mi.bucket_size / c->opts.btree_node_size);
        /* XXX: these should be tunable */
-       size_t reserve_none     = max_t(size_t, 4, ca->mi.nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 16, ca->mi.nbuckets >> 7);
-       size_t free_inc_reserve = copygc_reserve / 2;
+       size_t reserve_none     = max_t(size_t, 4, nbuckets >> 9);
+       size_t copygc_reserve   = max_t(size_t, 16, nbuckets >> 7);
+       size_t free_inc_nr      = max(max_t(size_t, 16, nbuckets >> 12),
+                                     btree_reserve);
        bool resize = ca->buckets != NULL,
             start_copygc = ca->copygc_thread != NULL;
        int ret = -ENOMEM;
@@ -858,8 +875,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
-           !init_fifo(&free_inc,       free_inc_reserve, GFP_KERNEL) ||
-           !init_heap(&alloc_heap,     free_inc_reserve, GFP_KERNEL) ||
+           !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
+           !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
            !init_heap(&copygc_heap,    copygc_reserve, GFP_KERNEL))
                goto err;
 
index 4deb6c37391c087b680b42578d29ee99dbc9a85e..ff86d23e15e47aae7d6e7d8757814ab8ec53b311 100644 (file)
@@ -114,11 +114,6 @@ static inline u8 ptr_stale(struct bch_dev *ca,
 
 /* bucket gc marks */
 
-/* The dirty and cached sector counts saturate. If this occurs,
- * reference counting alone will not free the bucket, and a btree
- * GC must be performed. */
-#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
-
 static inline unsigned bucket_sectors_used(struct bucket_mark mark)
 {
        return mark.dirty_sectors + mark.cached_sectors;
@@ -172,26 +167,12 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
-{
-       switch (s) {
-       case S_META:
-               return BCH_DATA_BTREE;
-       case S_DIRTY:
-               return BCH_DATA_USER;
-       default:
-               BUG();
-       }
-}
-
 struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
 struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
 void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                         struct disk_reservation *, struct gc_pos);
 
-u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
 
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
@@ -209,7 +190,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 void bch2_bucket_seq_cleanup(struct bch_fs *);
 
-bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
+void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
                            size_t, struct bucket_mark *);
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
                            size_t, bool, struct gc_pos, unsigned);
@@ -222,8 +203,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 #define BCH_BUCKET_MARK_GC_WILL_VISIT          (1 << 2)
 #define BCH_BUCKET_MARK_GC_LOCK_HELD           (1 << 3)
 
-void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
-                  struct bch_fs_usage *, u64, unsigned);
+void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, enum bch_data_type,
+                  struct gc_pos, struct bch_fs_usage *, u64, unsigned);
 
 void bch2_recalc_sectors_available(struct bch_fs *);
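
bch2_mark_key() now takes the type of data being marked - e.g. BCH_DATA_USER or BCH_DATA_BTREE - in place of the old metadata bool, which is what lets callers feed the replicas-based accounting above. The new call shape, as it appears in the extents hunk later in this commit:

	bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b),
		      &s->stats, s->trans->journal_res.seq, 0);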
 
index 10f00861385e98ae1bb3184a251843c44b01c0e9..6f7d3a23d2f09862456e4dba2f2cd1d1c87ef747 100644 (file)
@@ -1,8 +1,11 @@
 #ifndef _BUCKETS_TYPES_H
 #define _BUCKETS_TYPES_H
 
+#include "bcachefs_format.h"
 #include "util.h"
 
+#define BUCKET_JOURNAL_SEQ_BITS                16
+
 struct bucket_mark {
        union {
        struct {
@@ -56,23 +59,17 @@ struct bch_dev_usage {
        u64                     sectors_fragmented;
 };
 
-/* kill, switch to bch_data_type? */
-enum s_alloc {
-       S_META,
-       S_DIRTY,
-       S_ALLOC_NR,
-};
-
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
-       /* _uncompressed_ sectors: */
        u64                     online_reserved;
        u64                     available_cache;
 
        struct {
-               u64             data[S_ALLOC_NR];
+               u64             data[BCH_DATA_NR];
                u64             persistent_reserved;
-       }                       s[BCH_REPLICAS_MAX];
+       }                       replicas[BCH_REPLICAS_MAX];
+
+       u64                     buckets[BCH_DATA_NR];
 };
 
 /*
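
With enum s_alloc gone, per-replica usage is now bucketed directly by enum bch_data_type, so consumers no longer translate through s_alloc_to_data_type(). A sketch of summing usage under the new layout - illustrative only, not the real bch2_fs_sectors_used():

	static u64 fs_usage_total(const struct bch_fs_usage *u)
	{
		u64 total = 0;
		unsigned i, j;

		for (i = 0; i < BCH_REPLICAS_MAX; i++) {
			total += u->replicas[i].persistent_reserved;
			for (j = 0; j < BCH_DATA_NR; j++)
				total += u->replicas[i].data[j];
		}
		return total;
	}
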
index 5593b9a1de27cc7a737cb46a88fd7b204d5f4da9..c18079f9c0cb830b07384e3ef0863e490779bde9 100644 (file)
@@ -403,11 +403,10 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 
                for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                        dst.persistent_reserved[i] =
-                               src.s[i].persistent_reserved;
+                               src.replicas[i].persistent_reserved;
 
-                       for (j = 0; j < S_ALLOC_NR; j++)
-                               dst.sectors[s_alloc_to_data_type(j)][i] =
-                                       src.s[i].data[j];
+                       for (j = 0; j < BCH_DATA_NR; j++)
+                               dst.sectors[j][i] = src.replicas[i].data[j];
                }
 
                ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
index d979ae0eaa17ec4de6d6270428e65f01e6191c74..5f3e16b1ab868de7f2a915b72e184d9d064191df 100644 (file)
@@ -121,24 +121,26 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
        }
 }
 
-void bch2_dirent_to_text(struct bch_fs *c, char *buf,
-                        size_t size, struct bkey_s_c k)
+int bch2_dirent_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
+       char *out = buf, *end = buf + size;
        struct bkey_s_c_dirent d;
-       size_t n = 0;
 
        switch (k.k->type) {
        case BCH_DIRENT:
                d = bkey_s_c_to_dirent(k);
 
-               n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
-                                  bch2_dirent_name_bytes(d));
-               n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
+               out += bch_scnmemcpy(out, end - out, d.v->d_name,
+                                    bch2_dirent_name_bytes(d));
+               out += scnprintf(out, end - out, " -> %llu", d.v->d_inum);
                break;
        case BCH_DIRENT_WHITEOUT:
-               scnprintf(buf, size, "whiteout");
+               out += scnprintf(out, end - out, "whiteout");
                break;
        }
+
+       return out - buf;
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
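
bch2_dirent_to_text() - like the other *_to_text() helpers converted below - now returns the number of bytes written, scnprintf-style, so callers can chain output into a single buffer. A hedged sketch of the calling pattern this enables (the surrounding code is illustrative):

	char buf[256];
	char *out = buf, *end = buf + sizeof(buf);

	out += scnprintf(out, end - out, "dirent ");
	out += bch2_dirent_to_text(c, out, end - out, k);
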
@@ -289,7 +291,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                 * new_dst at the src position:
                                 */
                                new_dst->k.p = src_iter->pos;
-                               bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
+                               bch2_trans_update(trans,
+                                       BTREE_INSERT_ENTRY(src_iter,
+                                                          &new_dst->k_i));
                                return 0;
                        } else {
                                /* If we're overwriting, we can't insert new_dst
@@ -312,8 +316,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
                }
        }
 
-       bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
-       bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i));
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i));
        return 0;
 }
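
bch2_trans_update() now takes a struct btree_insert_entry built by the BTREE_INSERT_ENTRY() macro instead of a bare iterator/key pair plus flags, as in the calls above. The general shape, with iter and new_key standing in for whatever iterator and key the caller holds:

	bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i));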
 
index 4d92ffba144ee13b345513c2887a5f60e0f352d1..9fe32b9b2aab8a2b8ba412ed74f2164e04b19a84 100644 (file)
@@ -6,7 +6,7 @@
 extern const struct bch_hash_desc bch2_dirent_hash_desc;
 
 const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 
 #define bch2_bkey_dirent_ops (struct bkey_ops) {       \
        .key_invalid    = bch2_dirent_invalid,          \
index fe4bb52717cf9f518d2fa7f361e2f143438639e8..a4d7e52bcbd8da36cc27926084cf0fbfe0fde520 100644 (file)
@@ -733,8 +733,8 @@ err:
                      mark.gen, (unsigned) mark.v.counter);
 }
 
-void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
-                           size_t size, struct bkey_s_c k)
+int bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
+                          size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = buf + size;
        const char *invalid;
@@ -748,6 +748,7 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
        if (invalid)
                p(" invalid: %s", invalid);
 #undef p
+       return out - buf;
 }
 
 int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
@@ -857,30 +858,34 @@ void bch2_key_resize(struct bkey *k,
  * that we have to unpack the key, modify the unpacked key - then this
  * copies/repacks the unpacked to the original as necessary.
  */
-static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
-                         struct bkey_packed *dst, struct bkey *src)
+static void extent_save(struct btree *b, struct bkey_packed *dst,
+                       struct bkey *src)
 {
        struct bkey_format *f = &b->format;
        struct bkey_i *dst_unpacked;
-       bool ret;
 
-       if ((dst_unpacked = packed_to_bkey(dst))) {
+       if ((dst_unpacked = packed_to_bkey(dst)))
                dst_unpacked->k = *src;
-               ret = true;
-       } else {
-               ret = bch2_bkey_pack_key(dst, src, f);
-       }
-
-       if (ret && iter)
-               bch2_verify_key_order(b, iter, dst);
-
-       return ret;
+       else
+               BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-static void extent_save(struct btree *b, struct btree_node_iter *iter,
-                       struct bkey_packed *dst, struct bkey *src)
+static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
+                         struct bkey_i *src)
 {
-       BUG_ON(!__extent_save(b, iter, dst, src));
+       struct bkey_format *f = &b->format;
+       struct bkey_i *dst_unpacked;
+       struct bkey_packed tmp;
+
+       if ((dst_unpacked = packed_to_bkey(dst)))
+               dst_unpacked->k = src->k;
+       else if (bch2_bkey_pack_key(&tmp, &src->k, f))
+               memcpy_u64s(dst, &tmp, f->key_u64s);
+       else
+               return false;
+
+       memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
+       return true;
 }
 
 /*
@@ -1009,7 +1014,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                sort_key_next(iter, b, _r);
                        } else {
                                __bch2_cut_front(l.k->p, r);
-                               extent_save(b, NULL, rk, r.k);
+                               extent_save(b, rk, r.k);
                        }
 
                        extent_sort_sift(iter, b, _r - iter->data);
@@ -1023,7 +1028,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                        bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
 
                        __bch2_cut_front(r.k->p, l);
-                       extent_save(b, NULL, lk, l.k);
+                       extent_save(b, lk, l.k);
 
                        extent_sort_sift(iter, b, 0);
 
@@ -1031,7 +1036,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                           bkey_to_packed(&tmp.k));
                } else {
                        bch2_cut_back(bkey_start_pos(r.k), l.k);
-                       extent_save(b, NULL, lk, l.k);
+                       extent_save(b, lk, l.k);
                }
        }
 
@@ -1055,7 +1060,8 @@ struct extent_insert_state {
 
        /* for deleting: */
        struct bkey_i                   whiteout;
-       bool                            do_journal;
+       bool                            update_journal;
+       bool                            update_btree;
        bool                            deleting;
 };
 
@@ -1070,7 +1076,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
        if (!sectors)
                return;
 
-       bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
+       bch2_mark_key(c, k, sectors, BCH_DATA_USER, gc_pos_btree_node(b),
                      &s->stats, s->trans->journal_res.seq, 0);
 }
 
@@ -1112,197 +1118,197 @@ static bool bch2_extent_merge_inline(struct bch_fs *,
                                     struct bkey_packed *,
                                     bool);
 
-#define MAX_LOCK_HOLD_TIME     (5 * NSEC_PER_MSEC)
-
-static enum btree_insert_ret
-extent_insert_should_stop(struct extent_insert_state *s)
+static void verify_extent_nonoverlapping(struct btree *b,
+                                        struct btree_node_iter *_iter,
+                                        struct bkey_i *insert)
 {
-       struct btree *b = s->insert->iter->l[0].b;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree_node_iter iter;
+       struct bkey_packed *k;
+       struct bkey uk;
 
-       /*
-        * Check if we have sufficient space in both the btree node and the
-        * journal reservation:
-        *
-        * Each insert checks for room in the journal entry, but we check for
-        * room in the btree node up-front. In the worst case, bkey_cmpxchg()
-        * will insert two keys, and one iteration of this loop will insert one
-        * key, so we need room for three keys.
-        */
-       if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
-               return BTREE_INSERT_BTREE_NODE_FULL;
-       else if (!journal_res_insert_fits(s->trans, s->insert))
-               return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
-       else
-               return BTREE_INSERT_OK;
+       iter = *_iter;
+       k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD);
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD);
+#if 0
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+       if (k &&
+           (uk = bkey_unpack_key(b, k),
+            bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+               char buf1[100];
+               char buf2[100];
+
+               bch2_bkey_to_text(buf1, sizeof(buf1), &insert->k);
+               bch2_bkey_to_text(buf2, sizeof(buf2), &uk);
+
+               bch2_dump_btree_node(b);
+               panic("insert > next :\n"
+                     "insert %s\n"
+                     "next   %s\n",
+                     buf1, buf2);
+       }
+#endif
+
+#endif
+}
+
+static void verify_modified_extent(struct btree_iter *iter,
+                                  struct bkey_packed *k)
+{
+       bch2_btree_iter_verify(iter, iter->l[0].b);
+       bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s);
 }
 
 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
                               struct bkey_i *insert)
 {
        struct btree_iter_level *l = &iter->l[0];
-       struct bset_tree *t = bset_tree_last(l->b);
-       struct bkey_packed *where =
-               bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
-       struct bkey_packed *prev = bch2_bkey_prev_filter(l->b, t, where,
-                                                        KEY_TYPE_DISCARD);
-       struct bkey_packed *next_live_key = where;
-       unsigned clobber_u64s;
-
-       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       struct btree_node_iter node_iter;
+       struct bkey_packed *k;
 
-       if (prev)
-               where = bkey_next(prev);
+       BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
 
-       while (next_live_key != btree_bkey_last(l->b, t) &&
-              bkey_deleted(next_live_key))
-               next_live_key = bkey_next(next_live_key);
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       verify_extent_nonoverlapping(l->b, &l->iter, insert);
 
-       /*
-        * Everything between where and next_live_key is now deleted keys, and
-        * is overwritten:
-        */
-       clobber_u64s = (u64 *) next_live_key - (u64 *) where;
+       node_iter = l->iter;
+       k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
+       if (k && !bkey_written(l->b, k) &&
+           bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
+               return;
 
-       if (prev &&
-           bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
-               goto drop_deleted_keys;
+       node_iter = l->iter;
+       k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
+       if (k && !bkey_written(l->b, k) &&
+           bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
+               return;
 
-       if (next_live_key != btree_bkey_last(l->b, t) &&
-           bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
-                                   next_live_key, false))
-               goto drop_deleted_keys;
+       k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
 
-       bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
-                               clobber_u64s, where->u64s);
-       return;
-drop_deleted_keys:
-       bch2_bset_delete(l->b, where, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
-                                where, clobber_u64s, 0);
+       bch2_bset_insert(l->b, &l->iter, k, insert, 0);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+       bch2_btree_iter_verify(iter, l->b);
 }
 
 static void extent_insert_committed(struct extent_insert_state *s)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
-       struct bkey_i *insert = !s->deleting
-               ? s->insert->k
-               : &s->whiteout;
+       struct bkey_i *insert = s->insert->k;
        BKEY_PADDED(k) split;
 
-       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
        EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
        EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
 
-       if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
+       bkey_copy(&split.k, insert);
+       if (s->deleting)
+               split.k.k.type = KEY_TYPE_DISCARD;
+
+       if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+               bch2_cut_subtract_back(s, s->committed,
+                                      bkey_i_to_s(&split.k));
+       else
+               bch2_cut_back(s->committed, &split.k.k);
+
+       if (!bkey_cmp(s->committed, iter->pos))
                return;
 
-       if (s->deleting && !s->do_journal) {
-               bch2_cut_front(s->committed, insert);
-               goto done;
-       }
+       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
 
-       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       if (s->update_btree) {
+               if (debug_check_bkeys(c))
+                       bch2_bkey_debugcheck(c, iter->l[0].b,
+                                            bkey_i_to_s_c(&split.k));
 
-       bkey_copy(&split.k, insert);
+               EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
 
-       if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
-           bkey_cmp(s->committed, insert->k.p) &&
-           bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
-               /* XXX: possibly need to increase our reservation? */
-               bch2_cut_subtract_back(s, s->committed,
-                                     bkey_i_to_s(&split.k));
-               bch2_cut_front(s->committed, insert);
-               bch2_add_sectors(s, bkey_i_to_s_c(insert),
-                               bkey_start_offset(&insert->k),
-                               insert->k.size);
-       } else {
-               bch2_cut_back(s->committed, &split.k.k);
-               bch2_cut_front(s->committed, insert);
+               extent_bset_insert(c, iter, &split.k);
        }
 
-       if (debug_check_bkeys(c))
-               bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
+       if (s->update_journal) {
+               bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout);
+               if (s->deleting)
+                       split.k.k.type = KEY_TYPE_DISCARD;
 
-       bch2_btree_journal_key(s->trans, iter, &split.k);
+               bch2_cut_back(s->committed, &split.k.k);
 
-       if (!s->deleting)
-               extent_bset_insert(c, iter, &split.k);
-done:
-       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+               EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
+
+               bch2_btree_journal_key(s->trans, iter, &split.k);
+       }
+
+       bch2_cut_front(s->committed, insert);
 
        insert->k.needs_whiteout        = false;
-       s->do_journal                   = false;
        s->trans->did_work              = true;
 }
 
-static enum btree_insert_ret
-__extent_insert_advance_pos(struct extent_insert_state *s,
-                           struct bpos next_pos,
-                           struct bkey_s_c k)
+void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
 {
-       struct extent_insert_hook *hook = s->trans->hook;
-       enum btree_insert_ret ret;
+       struct btree *b = iter->l[0].b;
 
-       if (hook)
-               ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
-       else
-               ret = BTREE_INSERT_OK;
+       BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
 
-       if (ret == BTREE_INSERT_OK)
-               s->committed = next_pos;
+       bch2_cut_back(b->key.k.p, &k->k);
 
-       return ret;
+       BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0);
 }
 
-/*
- * Update iter->pos, marking how much of @insert we've processed, and call hook
- * fn:
- */
-static enum btree_insert_ret
-extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_insert *trans,
+                      struct btree_insert_entry *insert,
+                      unsigned *u64s)
 {
-       struct btree *b = s->insert->iter->l[0].b;
-       struct bpos next_pos = bpos_min(s->insert->k->k.p,
-                                       k.k ? k.k->p : b->key.k.p);
-       enum btree_insert_ret ret;
+       struct btree_iter_level *l = &insert->iter->l[0];
+       struct btree_node_iter node_iter = l->iter;
+       enum bch_extent_overlap overlap;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_s_c k;
+       int sectors;
 
-       if (race_fault())
-               return BTREE_INSERT_NEED_TRAVERSE;
+       BUG_ON(trans->flags & BTREE_INSERT_ATOMIC &&
+              !bch2_extent_is_atomic(&insert->k->k, insert->iter));
 
-       /* hole? */
-       if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
-               ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
-                                                   bkey_s_c_null);
-               if (ret != BTREE_INSERT_OK)
-                       return ret;
-       }
+       /*
+        * When deleting, we avoid creating whiteouts whenever possible, but
+        * those optimizations mean we may insert two whiteouts
+        * instead of one (when we overlap with the front of one extent and the
+        * back of another):
+        */
+       if (bkey_whiteout(&insert->k->k))
+               *u64s += BKEY_U64s;
 
-       /* avoid redundant calls to hook fn: */
-       if (!bkey_cmp(s->committed, next_pos))
+       _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+                                             KEY_TYPE_DISCARD);
+       if (!_k)
                return BTREE_INSERT_OK;
 
-       return __extent_insert_advance_pos(s, next_pos, k);
-}
+       k = bkey_disassemble(l->b, _k, &unpacked);
 
-static enum btree_insert_ret
-extent_insert_check_split_compressed(struct extent_insert_state *s,
-                                    struct bkey_s_c k,
-                                    enum bch_extent_overlap overlap)
-{
-       struct bch_fs *c = s->trans->c;
-       unsigned sectors;
+       overlap = bch2_extent_overlap(&insert->k->k, k.k);
+
+       /* account for having to split existing extent: */
+       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+               *u64s += _k->u64s;
 
        if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
            (sectors = bch2_extent_is_compressed(k))) {
                int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
 
-               if (s->trans->flags & BTREE_INSERT_NOFAIL)
+               if (trans->flags & BTREE_INSERT_NOFAIL)
                        flags |= BCH_DISK_RESERVATION_NOFAIL;
 
-               switch (bch2_disk_reservation_add(c,
-                               s->trans->disk_res,
+               switch (bch2_disk_reservation_add(trans->c,
+                               trans->disk_res,
                                sectors * bch2_extent_nr_dirty_ptrs(k),
                                flags)) {
                case 0:
@@ -1319,78 +1325,60 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
        return BTREE_INSERT_OK;
 }
 
-static enum btree_insert_ret
+static void
 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
-             struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
+             struct bkey_packed *_k, struct bkey_s k,
              enum bch_extent_overlap overlap)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       enum btree_insert_ret ret;
 
        switch (overlap) {
        case BCH_EXTENT_OVERLAP_FRONT:
                /* insert overlaps with start of k: */
                bch2_cut_subtract_front(s, insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
+               verify_modified_extent(iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_BACK:
                /* insert overlaps with end of k: */
                bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
 
                /*
                 * As the auxiliary tree is indexed by the end of the
                 * key and we've just changed the end, update the
                 * auxiliary tree.
                 */
-               bch2_bset_fix_invalidated_key(b, t, _k);
-               bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                       _k, _k->u64s, _k->u64s);
+               bch2_bset_fix_invalidated_key(l->b, _k);
+               bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                        _k, _k->u64s, _k->u64s);
+               verify_modified_extent(iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_ALL: {
-               struct bpos orig_pos = k.k->p;
-
                /* The insert key completely covers k, invalidate k */
                if (!bkey_whiteout(k.k))
-                       btree_keys_account_key_drop(&b->nr,
-                                               t - b->set, _k);
+                       btree_account_key_drop(l->b, _k);
 
                bch2_drop_subtract(s, k);
-               k.k->p = bkey_start_pos(&insert->k);
-               if (!__extent_save(b, node_iter, _k, k.k)) {
-                       /*
-                        * Couldn't repack: we aren't necessarily able
-                        * to repack if the new key is outside the range
-                        * of the old extent, so we have to split
-                        * @insert:
-                        */
-                       k.k->p = orig_pos;
-                       extent_save(b, node_iter, _k, k.k);
 
-                       ret = extent_insert_advance_pos(s, k.s_c);
-                       if (ret != BTREE_INSERT_OK)
-                               return ret;
+               if (_k >= btree_bset_last(l->b)->start) {
+                       unsigned u64s = _k->u64s;
 
-                       extent_insert_committed(s);
-                       /*
-                        * We split and inserted up to k.k->p - that
-                        * has to coincide with iter->pos, so that we
-                        * don't have anything more we have to insert
-                        * until we recheck our journal reservation:
-                        */
-                       EBUG_ON(bkey_cmp(s->committed, k.k->p));
+                       bch2_bset_delete(l->b, _k, _k->u64s);
+                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                                _k, u64s, 0);
+                       bch2_btree_iter_verify(iter, l->b);
                } else {
-                       bch2_bset_fix_invalidated_key(b, t, _k);
-                       bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                               _k, _k->u64s, _k->u64s);
+                       extent_save(l->b, _k, k.k);
+                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                                _k, _k->u64s, _k->u64s);
+                       verify_modified_extent(iter, _k);
                }
 
                break;
@@ -1412,14 +1400,15 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
                 * what k points to)
                 */
                bkey_reassemble(&split.k, k.s_c);
-               split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
+               split.k.k.needs_whiteout |= bkey_written(l->b, _k);
 
                bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
                BUG_ON(bkey_deleted(&split.k.k));
 
                bch2_cut_subtract_front(s, insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
+               verify_modified_extent(iter, _k);
 
                bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
                                bkey_start_offset(&split.k.k),
@@ -1428,158 +1417,96 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
                break;
        }
        }
-
-       return BTREE_INSERT_OK;
 }
 
-static enum btree_insert_ret
-__bch2_delete_fixup_extent(struct extent_insert_state *s)
+static void __bch2_insert_fixup_extent(struct extent_insert_state *s)
 {
-       struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
        struct bkey_packed *_k;
        struct bkey unpacked;
        struct bkey_i *insert = s->insert->k;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
-       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
-       s->whiteout = *insert;
-       s->whiteout.k.type = KEY_TYPE_DISCARD;
 
        while (bkey_cmp(s->committed, insert->k.p) < 0 &&
-              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
+              (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
+                                                     KEY_TYPE_DISCARD))) {
+               struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
+               enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k);
 
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
                EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
 
                if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
                        break;
 
-               if (bkey_whiteout(k.k)) {
-                       s->committed = bpos_min(insert->k.p, k.k->p);
-                       goto next;
-               }
-
-               overlap = bch2_extent_overlap(&insert->k, k.k);
-
-               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
-               if (ret)
-                       break;
-
-               ret = extent_insert_advance_pos(s, k.s_c);
-               if (ret)
-                       break;
-
-               s->do_journal = true;
-
-               if (overlap == BCH_EXTENT_OVERLAP_ALL) {
-                       btree_keys_account_key_drop(&b->nr,
-                                               t - b->set, _k);
-                       bch2_subtract_sectors(s, k.s_c,
-                                            bkey_start_offset(k.k), k.k->size);
-                       _k->type = KEY_TYPE_DISCARD;
-                       reserve_whiteout(b, t, _k);
-               } else if (k.k->needs_whiteout ||
-                          bset_written(b, bset(b, t))) {
-                       struct bkey_i discard = *insert;
-
-                       discard.k.type = KEY_TYPE_DISCARD;
-
-                       switch (overlap) {
-                       case BCH_EXTENT_OVERLAP_FRONT:
-                               bch2_cut_front(bkey_start_pos(k.k), &discard);
-                               break;
-                       case BCH_EXTENT_OVERLAP_BACK:
-                               bch2_cut_back(k.k->p, &discard.k);
-                               break;
-                       default:
-                               break;
-                       }
+               s->committed = bpos_min(s->insert->k->k.p, k.k->p);
 
-                       discard.k.needs_whiteout = true;
-
-                       ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
+               if (!bkey_whiteout(k.k))
+                       s->update_journal = true;
 
-                       extent_bset_insert(c, iter, &discard);
-               } else {
-                       ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
+               if (!s->update_journal) {
+                       bch2_cut_front(s->committed, insert);
+                       bch2_cut_front(s->committed, &s->whiteout);
+                       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+                       goto next;
                }
-next:
-               bch2_cut_front(s->committed, insert);
-               bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
-       }
-
-       return ret;
-}
-
-static enum btree_insert_ret
-__bch2_insert_fixup_extent(struct extent_insert_state *s)
-{
-       struct btree_iter *iter = s->insert->iter;
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
-       struct bkey_i *insert = s->insert->k;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
-       while (bkey_cmp(s->committed, insert->k.p) < 0 &&
-              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
-
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
-               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
-                       break;
-
-               overlap = bch2_extent_overlap(&insert->k, k.k);
-
-               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
-               if (ret)
-                       break;
-
-               if (!k.k->size)
-                       goto squash;
 
                /*
-                * Only call advance pos & call hook for nonzero size extents:
+                * When deleting, if possible just do it by switching the type
+                * of the key we're deleting, instead of creating and inserting
+                * a new whiteout:
                 */
-               ret = extent_insert_advance_pos(s, k.s_c);
-               if (ret)
+               if (s->deleting &&
+                   !s->update_btree &&
+                   !bkey_cmp(insert->k.p, k.k->p) &&
+                   !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+                       if (!bkey_whiteout(k.k)) {
+                               btree_account_key_drop(l->b, _k);
+                               bch2_subtract_sectors(s, k.s_c,
+                                                     bkey_start_offset(k.k), k.k->size);
+                               _k->type = KEY_TYPE_DISCARD;
+                               reserve_whiteout(l->b, _k);
+                       }
                        break;
+               }
 
-               if (k.k->size &&
-                   (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+               if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
                        insert->k.needs_whiteout = true;
+                       s->update_btree = true;
+               }
 
-               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+               if (s->update_btree &&
+                   overlap == BCH_EXTENT_OVERLAP_ALL &&
                    bkey_whiteout(k.k) &&
                    k.k->needs_whiteout) {
-                       unreserve_whiteout(b, t, _k);
+                       unreserve_whiteout(l->b, _k);
                        _k->needs_whiteout = false;
                }
-squash:
-               ret = extent_squash(s, insert, t, _k, k, overlap);
-               if (ret != BTREE_INSERT_OK)
+
+               extent_squash(s, insert, _k, k, overlap);
+
+               if (!s->update_btree)
+                       bch2_cut_front(s->committed, insert);
+next:
+               if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+                   overlap == BCH_EXTENT_OVERLAP_MIDDLE)
                        break;
        }
 
-       return ret;
+       if (bkey_cmp(s->committed, insert->k.p) < 0)
+               s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p);
+
+       /*
+        * We may have skipped past some deleted extents greater than the
+        * insert key before we got to a non-deleted extent and knew we could
+        * bail out; rewind the iterator a bit if necessary:
+        */
+       {
+               struct btree_node_iter node_iter = l->iter;
+
+               while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+                      bkey_cmp_left_packed(l->b, _k, &s->committed) > 0)
+                       l->iter = node_iter;
+       }
 }
 
 /**
@@ -1625,16 +1552,17 @@ enum btree_insert_ret
 bch2_insert_fixup_extent(struct btree_insert *trans,
                         struct btree_insert_entry *insert)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter = insert->iter;
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
+       struct bch_fs *c        = trans->c;
+       struct btree_iter *iter = insert->iter;
+       struct btree *b         = iter->l[0].b;
        struct extent_insert_state s = {
                .trans          = trans,
                .insert         = insert,
-               .committed      = insert->iter->pos,
+               .committed      = iter->pos,
+
+               .whiteout       = *insert->k,
+               .update_journal = !bkey_whiteout(&insert->k->k),
+               .update_btree   = !bkey_whiteout(&insert->k->k),
                .deleting       = bkey_whiteout(&insert->k->k),
        };
 
@@ -1655,45 +1583,23 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
                                bkey_start_offset(&insert->k->k),
                                insert->k->k.size);
 
-       ret = !s.deleting
-               ? __bch2_insert_fixup_extent(&s)
-               : __bch2_delete_fixup_extent(&s);
-
-       if (ret == BTREE_INSERT_OK &&
-           bkey_cmp(s.committed, insert->k->k.p) < 0)
-               ret = extent_insert_advance_pos(&s, bkey_s_c_null);
+       __bch2_insert_fixup_extent(&s);
 
        extent_insert_committed(&s);
 
-       if (s.deleting)
-               bch2_cut_front(iter->pos, insert->k);
-
-       /*
-        * Subtract any remaining sectors from @insert, if we bailed out early
-        * and didn't fully insert @insert:
-        */
-       if (!s.deleting &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
-           insert->k->k.size)
-               bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
-                                    bkey_start_offset(&insert->k->k),
-                                    insert->k->k.size);
-
        bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
                           gc_pos_btree_node(b));
 
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
        EBUG_ON(bkey_cmp(iter->pos, s.committed));
-       EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
-               !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
-
-       if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
-               ret = BTREE_INSERT_NEED_TRAVERSE;
 
-       WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
-                 "ret %u insert->k.size %u", ret, insert->k->k.size);
+       if (insert->k->k.size) {
+               /* got to the end of this leaf node */
+               BUG_ON(bkey_cmp(iter->pos, b->key.k.p));
+               return BTREE_INSERT_NEED_TRAVERSE;
+       }
 
-       return ret;
+       return BTREE_INSERT_OK;
 }
 
 const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
@@ -1877,8 +1783,8 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k
        }
 }
 
-void bch2_extent_to_text(struct bch_fs *c, char *buf,
-                        size_t size, struct bkey_s_c k)
+int bch2_extent_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = buf + size;
        const char *invalid;
@@ -1892,6 +1798,7 @@ void bch2_extent_to_text(struct bch_fs *c, char *buf,
        if (invalid)
                p(" invalid: %s", invalid);
 #undef p
+       return out - buf;
 }
 
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -2162,130 +2069,6 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
        return BCH_MERGE_MERGE;
 }
 
-static void extent_i_save(struct btree *b, struct bkey_packed *dst,
-                         struct bkey_i *src)
-{
-       struct bkey_format *f = &b->format;
-       struct bkey_i *dst_unpacked;
-
-       BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
-
-       /*
-        * We don't want the bch2_verify_key_order() call in extent_save(),
-        * because we may be out of order with deleted keys that are about to be
-        * removed by extent_bset_insert()
-        */
-
-       if ((dst_unpacked = packed_to_bkey(dst)))
-               bkey_copy(dst_unpacked, src);
-       else
-               BUG_ON(!bch2_bkey_pack(dst, src, f));
-}
-
-static bool extent_merge_one_overlapping(struct btree_iter *iter,
-                                        struct bpos new_pos,
-                                        struct bset_tree *t,
-                                        struct bkey_packed *k, struct bkey uk,
-                                        bool check, bool could_pack)
-{
-       struct btree_iter_level *l = &iter->l[0];
-
-       BUG_ON(!bkey_deleted(k));
-
-       if (check) {
-               return !bkey_packed(k) || could_pack;
-       } else {
-               uk.p = new_pos;
-               extent_save(l->b, &l->iter, k, &uk);
-               bch2_bset_fix_invalidated_key(l->b, t, k);
-               bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
-                                        k, k->u64s, k->u64s);
-               return true;
-       }
-}
-
-static bool extent_merge_do_overlapping(struct btree_iter *iter,
-                                       struct bkey *m, bool back_merge)
-{
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bset_tree *t;
-       struct bkey_packed *k;
-       struct bkey uk;
-       struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
-       bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
-       bool check = true;
-
-       /*
-        * @m is the new merged extent:
-        *
-        * The merge took place in the last bset; we know there can't be any 0
-        * size extents overlapping with m there because if so they would have
-        * been between the two extents we merged.
-        *
-        * But in the other bsets, we have to check for and fix such extents:
-        */
-do_fixup:
-       for_each_bset(b, t) {
-               if (t == bset_tree_last(b))
-                       break;
-
-               /*
-                * if we don't find this bset in the iterator we already got to
-                * the end of that bset, so start searching from the end.
-                */
-               k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-               if (k == btree_bkey_last(b, t))
-                       k = bch2_bkey_prev_all(b, t, k);
-               if (!k)
-                       continue;
-
-               if (back_merge) {
-                       /*
-                        * Back merge: 0 size extents will be before the key
-                        * that was just inserted (and thus the iterator
-                        * position) - walk backwards to find them
-                        */
-                       for (;
-                            k &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
-                            k = bch2_bkey_prev_all(b, t, k)) {
-                               if (bkey_cmp(uk.p, m->p) >= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               } else {
-                       /* Front merge - walk forwards */
-                       for (;
-                            k != btree_bkey_last(b, t) &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, m->p) < 0);
-                            k = bkey_next(k)) {
-                               if (bkey_cmp(uk.p,
-                                            bkey_start_pos(m)) <= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               }
-       }
-
-       if (check) {
-               check = false;
-               goto do_fixup;
-       }
-
-       return true;
-}
-
 /*
  * When merging an extent that we're inserting into a btree node, the new merged
  * extent could overlap with an existing 0 size extent - if we don't fix that,
@@ -2302,13 +2085,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 {
        struct btree *b = iter->l[0].b;
        struct btree_node_iter *node_iter = &iter->l[0].iter;
-       const struct bkey_format *f = &b->format;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bkey_packed *m;
-       BKEY_PADDED(k) li;
-       BKEY_PADDED(k) ri;
-       struct bkey_i *mi;
-       struct bkey tmp;
+       BKEY_PADDED(k) li, ri;
+       struct bkey_packed *m   = back_merge ? l : r;
+       struct bkey_i *mi       = back_merge ? &li.k : &ri.k;
+       struct bset_tree *t     = bch2_bkey_to_bset(b, m);
+       enum merge_result ret;
+
+       EBUG_ON(bkey_written(b, m));
 
        /*
         * We need to save copies of both l and r, because we might get a
@@ -2317,57 +2100,49 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
        bch2_bkey_unpack(b, &li.k, l);
        bch2_bkey_unpack(b, &ri.k, r);
 
-       m = back_merge ? l : r;
-       mi = back_merge ? &li.k : &ri.k;
+       ret = bch2_extent_merge(c, b, &li.k, &ri.k);
+       if (ret == BCH_MERGE_NOMERGE)
+               return false;
 
-       /* l & r should be in last bset: */
-       EBUG_ON(bch2_bkey_to_bset(b, m) != t);
+       /*
+        * Check whether we overlap with deleted extents, which would break
+        * the sort order:
+        */
+       if (back_merge) {
+               struct bkey_packed *n = bkey_next(m);
 
-       switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
-       case BCH_MERGE_NOMERGE:
-               return false;
-       case BCH_MERGE_PARTIAL:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
+               if (n != btree_bkey_last(b, t) &&
+                   bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
+                   bkey_deleted(n))
                        return false;
+       } else if (ret == BCH_MERGE_MERGE) {
+               struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+               if (prev &&
+                   bkey_cmp_left_packed_byval(b, prev,
+                               bkey_start_pos(&li.k.k)) > 0)
                        return false;
+       }
 
-               extent_i_save(b, m, mi);
-               bch2_bset_fix_invalidated_key(b, t, m);
-
-               /*
-                * Update iterator to reflect what we just inserted - otherwise,
-                * the iter_fix() call is going to put us _before_ the key we
-                * just partially merged with:
-                */
-               if (back_merge)
-                       bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
-
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
+       if (ret == BCH_MERGE_PARTIAL) {
+               if (!extent_i_save(b, m, mi))
+                       return false;
 
                if (!back_merge)
                        bkey_copy(packed_to_bkey(l), &li.k);
                else
                        bkey_copy(packed_to_bkey(r), &ri.k);
-               return false;
-       case BCH_MERGE_MERGE:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
-                       return false;
-
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+       } else {
+               if (!extent_i_save(b, m, &li.k))
                        return false;
+       }
 
-               extent_i_save(b, m, &li.k);
-               bch2_bset_fix_invalidated_key(b, t, m);
+       bch2_bset_fix_invalidated_key(b, m);
+       bch2_btree_node_iter_fix(iter, b, node_iter,
+                                m, m->u64s, m->u64s);
+       verify_modified_extent(iter, m);
 
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
-               return true;
-       default:
-               BUG();
-       }
+       return ret == BCH_MERGE_MERGE;
 }
 
 int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
index 08ad9647240616749830c1fae7a7f19812086523..66a02f1c5e5b78de8b427fa03bd0018b20284f7e 100644 (file)
@@ -11,14 +11,13 @@ struct btree_node_iter;
 struct btree_node_iter_large;
 struct btree_insert;
 struct btree_insert_entry;
-struct extent_insert_hook;
 struct bch_devs_mask;
 union bch_extent_crc;
 
 const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
                               struct bkey_s_c);
-void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
 
 #define bch2_bkey_btree_ops (struct bkey_ops) {                        \
@@ -30,7 +29,7 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
 
 const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
 enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
                                    struct bkey_i *, struct bkey_i *);
@@ -61,9 +60,22 @@ int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
                         struct bch_devs_mask *,
                         struct extent_pick_ptr *);
 
+void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+
+static inline bool bch2_extent_is_atomic(struct bkey *k,
+                                        struct btree_iter *iter)
+{
+       struct btree *b = iter->l[0].b;
+
+       return bkey_cmp(k->p, b->key.k.p) <= 0 &&
+               bkey_cmp(bkey_start_pos(k), b->data->min_key) >= 0;
+}
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_insert *, struct btree_insert_entry *,
+                      unsigned *);
 enum btree_insert_ret
-bch2_insert_fixup_extent(struct btree_insert *,
-                       struct btree_insert_entry *);
+bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
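
bch2_extent_is_atomic() reports whether an insert lies entirely within the leaf node the iterator points at; bch2_extent_trim_atomic() cuts the key back so that it does. A hypothetical caller - not taken from this commit - would trim before committing and loop on any remainder:

	/* k is the struct bkey_i * the caller wants to insert: */
	bch2_extent_trim_atomic(k, iter);
	BUG_ON(!bch2_extent_is_atomic(&k->k, iter));
	/* ...commit the trimmed portion, then continue with the rest */
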
index 789ae663bcbf1630730158abe8f0d2835dc0973f..085d828e6ed154e0cb7501a82edb201859655fdc 100644 (file)
@@ -108,17 +108,17 @@ do {                                                                      \
 #define fifo_peek(fifo)                fifo_peek_front(fifo)
 
 #define fifo_for_each_entry(_entry, _fifo, _iter)                      \
-       for (((void) (&(_iter) == &(_fifo)->front)),                    \
-            _iter = (_fifo)->front;                                    \
+       for (typecheck(typeof((_fifo)->front), _iter),                  \
+            (_iter) = (_fifo)->front;                                  \
             ((_iter != (_fifo)->back) &&                               \
              (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
-            _iter++)
+            (_iter)++)
 
 #define fifo_for_each_entry_ptr(_ptr, _fifo, _iter)                    \
-       for (((void) (&(_iter) == &(_fifo)->front)),                    \
-            _iter = (_fifo)->front;                                    \
+       for (typecheck(typeof((_fifo)->front), _iter),                  \
+            (_iter) = (_fifo)->front;                                  \
             ((_iter != (_fifo)->back) &&                               \
              (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true));  \
-            _iter++)
+            (_iter)++)
 
 #endif /* _BCACHEFS_FIFO_H */
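
The fifo iteration macros now use the kernel's typecheck() in place of the hand-rolled (void) (&(_iter) == &(_fifo)->front) comparison; both trigger a compiler warning when _iter's type doesn't match the fifo's index type, but typecheck() states the intent directly. For reference, the macro from include/linux/typecheck.h is essentially:

	/* Evaluates to 1; warns at compile time if x is not of type: */
	#define typecheck(type, x)			\
	({	type __dummy;				\
		typeof(x) __dummy2;			\
		(void)(&__dummy == &__dummy2);		\
		1;					\
	})
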
index e4d2b39e0d8271a69bf88795c99023ed497565f4..d4384303a9a1d9c65ed6eee4f62c94ae6ccd52f1 100644 (file)
@@ -5,6 +5,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "error.h"
+#include "extents.h"
 #include "fs.h"
 #include "fs-io.h"
 #include "fsck.h"
@@ -32,16 +33,6 @@ struct quota_res {
        u64                             sectors;
 };
 
-struct i_sectors_hook {
-       struct extent_insert_hook       hook;
-       struct bch_inode_info           *inode;
-       struct quota_res                quota_res;
-       s64                             sectors;
-       u64                             new_i_size;
-       unsigned                        flags;
-       unsigned                        appending:1;
-};
-
 struct bchfs_write_op {
        struct bch_inode_info           *inode;
        s64                             sectors_added;
@@ -177,28 +168,48 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
 
 /* i_size updates: */
 
+struct inode_new_size {
+       loff_t          new_size;
+       u64             now;
+       unsigned        fields;
+};
+
 static int inode_set_size(struct bch_inode_info *inode,
                          struct bch_inode_unpacked *bi,
                          void *p)
 {
-       loff_t *new_i_size = p;
+       struct inode_new_size *s = p;
 
-       lockdep_assert_held(&inode->ei_update_lock);
+       bi->bi_size = s->new_size;
+       if (s->fields & ATTR_ATIME)
+               bi->bi_atime = s->now;
+       if (s->fields & ATTR_MTIME)
+               bi->bi_mtime = s->now;
+       if (s->fields & ATTR_CTIME)
+               bi->bi_ctime = s->now;
 
-       bi->bi_size = *new_i_size;
        return 0;
 }
 
 static int __must_check bch2_write_inode_size(struct bch_fs *c,
                                              struct bch_inode_info *inode,
-                                             loff_t new_size)
+                                             loff_t new_size, unsigned fields)
 {
-       return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0);
+       struct inode_new_size s = {
+               .new_size       = new_size,
+               .now            = bch2_current_time(c),
+               .fields         = fields,
+       };
+
+       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
 }
 
 static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, int sectors)
+                          struct quota_res *quota_res, s64 sectors)
 {
+       if (!sectors)
+               return;
+
        mutex_lock(&inode->ei_quota_lock);
 #ifdef CONFIG_BCACHEFS_QUOTA
        if (quota_res && sectors > 0) {
@@ -215,297 +226,191 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
        mutex_unlock(&inode->ei_quota_lock);
 }
 
-/* i_sectors accounting: */
-
-static enum btree_insert_ret
-i_sectors_hook_fn(struct extent_insert_hook *hook,
-                 struct bpos committed_pos,
-                 struct bpos next_pos,
-                 struct bkey_s_c k,
-                 const struct bkey_i *insert)
-{
-       struct i_sectors_hook *h = container_of(hook,
-                               struct i_sectors_hook, hook);
-       s64 sectors = next_pos.offset - committed_pos.offset;
-       int sign = bkey_extent_is_allocation(&insert->k) -
-               (k.k && bkey_extent_is_allocation(k.k));
-
-       EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
-
-       h->sectors += sectors * sign;
-
-       return BTREE_INSERT_OK;
-}
-
-static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
-                                    struct bch_inode_unpacked *bi,
-                                    void *p)
-{
-       struct i_sectors_hook *h = p;
-
-       if (h->new_i_size != U64_MAX &&
-           (!h->appending ||
-            h->new_i_size > bi->bi_size))
-               bi->bi_size = h->new_i_size;
-       bi->bi_sectors  += h->sectors;
-       bi->bi_flags    &= ~h->flags;
-       return 0;
-}
+/* normal i_size/i_sectors update machinery: */
 
-static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
+static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
+                                bool *allocating)
 {
-       int ret;
-
-       mutex_lock(&h->inode->ei_update_lock);
-       i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
+       struct btree_iter iter;
+       struct bkey_s_c old;
+       s64 delta = 0;
 
-       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0);
+       bch2_btree_iter_init(&iter, _iter->c, BTREE_ID_EXTENTS, POS_MIN,
+                            BTREE_ITER_SLOTS);
 
-       if (!ret && h->new_i_size != U64_MAX)
-               i_size_write(&h->inode->v, h->new_i_size);
-       mutex_unlock(&h->inode->ei_update_lock);
+       bch2_btree_iter_link(_iter, &iter);
+       bch2_btree_iter_copy(&iter, _iter);
 
-       bch2_quota_reservation_put(c, h->inode, &h->quota_res);
+       for_each_btree_key_continue(&iter, BTREE_ITER_SLOTS, old) {
+               if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+                       break;
 
-       h->sectors = 0;
+               if (allocating &&
+                   !bch2_extent_is_fully_allocated(old))
+                       *allocating = true;
 
-       return ret;
-}
-
-static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
-                                   struct bch_inode_unpacked *bi, void *p)
-{
-       struct i_sectors_hook *h = p;
+               delta += (min(new->k.p.offset,
+                             old.k->p.offset) -
+                         max(bkey_start_offset(&new->k),
+                             bkey_start_offset(old.k))) *
+                       (bkey_extent_is_allocation(&new->k) -
+                        bkey_extent_is_allocation(old.k));
+       }
 
-       if (h->flags & BCH_INODE_I_SIZE_DIRTY)
-               bi->bi_size = h->new_i_size;
+       bch2_btree_iter_unlink(&iter);
 
-       bi->bi_flags |= h->flags;
-       return 0;
+       return delta;
 }
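
Each overlapped key contributes (overlap length) times (change in allocation status), so the delta is +N when N previously unallocated sectors become allocated, -N for the reverse, and 0 when the status doesn't change. A standalone sketch of that rule with a worked example (hypothetical helper on plain sector offsets, not the bcachefs API):

/* Per-overlap term from the loop above: */
static s64 overlap_delta(u64 new_start, u64 new_end, bool new_alloc,
			 u64 old_start, u64 old_end, bool old_alloc)
{
	u64 start = new_start > old_start ? new_start : old_start;
	u64 end   = new_end   < old_end   ? new_end   : old_end;

	if (end <= start)
		return 0;

	return (s64) (end - start) *
		((int) new_alloc - (int) old_alloc);
}

/*
 * E.g. an allocated write over sectors [0,8) where [0,4) was already
 * allocated and [4,8) was a hole: 4 * 0 + 4 * 1 = +4 sectors, which
 * is exactly what ends up added to bi_sectors.
 */
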
 
-static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+static int bch2_extent_update(struct btree_trans *trans,
+                             struct bch_inode_info *inode,
+                             struct disk_reservation *disk_res,
+                             struct quota_res *quota_res,
+                             struct btree_iter *extent_iter,
+                             struct bkey_i *k,
+                             u64 new_i_size,
+                             bool may_allocate,
+                             bool direct,
+                             s64 *total_delta)
 {
+       struct btree_iter *inode_iter = NULL;
+       struct bch_inode_unpacked inode_u;
+       struct bkey_inode_buf inode_p;
+       bool allocating = false;
+       bool extended = false;
+       s64 i_sectors_delta;
        int ret;
 
-       mutex_lock(&h->inode->ei_update_lock);
-       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0);
-       mutex_unlock(&h->inode->ei_update_lock);
-
-       return ret;
-}
+       bch2_trans_begin_updates(trans);
 
-static inline struct i_sectors_hook
-i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
-{
-       return (struct i_sectors_hook) {
-               .hook.fn        = i_sectors_hook_fn,
-               .inode          = inode,
-               .sectors        = 0,
-               .new_i_size     = U64_MAX,
-               .flags          = flags|BCH_INODE_I_SECTORS_DIRTY,
-       };
-}
-
-/* normal i_size/i_sectors update machinery: */
+       ret = bch2_btree_iter_traverse(extent_iter);
+       if (ret)
+               return ret;
 
-struct bchfs_extent_trans_hook {
-       struct bchfs_write_op           *op;
-       struct extent_insert_hook       hook;
+       bch2_extent_trim_atomic(k, extent_iter);
 
-       struct bch_inode_unpacked       inode_u;
-       struct bkey_inode_buf           inode_p;
+       i_sectors_delta = sum_sector_overwrites(k, extent_iter, &allocating);
+       if (!may_allocate && allocating)
+               return -ENOSPC;
 
-       bool                            need_inode_update;
-};
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k));
 
-static enum btree_insert_ret
-bchfs_extent_update_hook(struct extent_insert_hook *hook,
-                        struct bpos committed_pos,
-                        struct bpos next_pos,
-                        struct bkey_s_c k,
-                        const struct bkey_i *insert)
-{
-       struct bchfs_extent_trans_hook *h = container_of(hook,
-                               struct bchfs_extent_trans_hook, hook);
-       struct bch_inode_info *inode = h->op->inode;
-       int sign = bkey_extent_is_allocation(&insert->k) -
-               (k.k && bkey_extent_is_allocation(k.k));
-       s64 sectors = (s64) (next_pos.offset - committed_pos.offset) * sign;
-       u64 offset = min(next_pos.offset << 9, h->op->new_i_size);
-       bool do_pack = false;
+       new_i_size = min(k->k.p.offset << 9, new_i_size);
 
-       if (h->op->unalloc &&
-           !bch2_extent_is_fully_allocated(k))
-               return BTREE_INSERT_ENOSPC;
+       /* XXX: inode->i_size locking */
+       if (i_sectors_delta ||
+           new_i_size > inode->ei_inode.bi_size) {
+               inode_iter = bch2_trans_get_iter(trans,
+                       BTREE_ID_INODES,
+                       POS(k->k.p.inode, 0),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+               if (IS_ERR(inode_iter))
+                       return PTR_ERR(inode_iter);
+
+               ret = bch2_btree_iter_traverse(inode_iter);
+               if (ret)
+                       goto err;
 
-       BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
+               inode_u = inode->ei_inode;
+               inode_u.bi_sectors += i_sectors_delta;
 
-       /* XXX: inode->i_size locking */
-       if (offset > inode->ei_inode.bi_size) {
-               if (!h->need_inode_update) {
-                       h->need_inode_update = true;
-                       return BTREE_INSERT_NEED_TRAVERSE;
+               /* XXX: this is slightly suspect */
+               if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                   new_i_size > inode_u.bi_size) {
+                       inode_u.bi_size = new_i_size;
+                       extended = true;
                }
 
-               /* truncate in progress? */
-               if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
-                       goto no_i_size_update;
+               bch2_inode_pack(&inode_p, &inode_u);
+               bch2_trans_update(trans,
+                       BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i));
+       }
 
-               h->inode_u.bi_size = offset;
-               do_pack = true;
+       ret = bch2_trans_commit(trans, disk_res,
+                               &inode->ei_journal_seq,
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_ATOMIC|
+                               BTREE_INSERT_NOUNLOCK|
+                               BTREE_INSERT_USE_RESERVE);
+       if (ret)
+               goto err;
 
-               inode->ei_inode.bi_size = offset;
+       inode->ei_inode.bi_sectors += i_sectors_delta;
 
-               spin_lock(&inode->v.i_lock);
-               if (offset > inode->v.i_size) {
-                       if (h->op->is_dio)
-                               i_size_write(&inode->v, offset);
-                       else
-                               BUG();
-               }
-               spin_unlock(&inode->v.i_lock);
-       }
-no_i_size_update:
-       if (sectors) {
-               if (!h->need_inode_update) {
-                       h->need_inode_update = true;
-                       return BTREE_INSERT_NEED_TRAVERSE;
-               }
+       EBUG_ON(i_sectors_delta &&
+               inode->ei_inode.bi_sectors != inode_u.bi_sectors);
 
-               h->inode_u.bi_sectors += sectors;
-               do_pack = true;
+       if (extended) {
+               inode->ei_inode.bi_size = new_i_size;
 
-               h->op->sectors_added += sectors;
+               if (direct) {
+                       spin_lock(&inode->v.i_lock);
+                       if (new_i_size > inode->v.i_size)
+                               i_size_write(&inode->v, new_i_size);
+                       spin_unlock(&inode->v.i_lock);
+               }
        }
 
-       if (do_pack)
-               bch2_inode_pack(&h->inode_p, &h->inode_u);
+       if (direct)
+               i_sectors_acct(trans->c, inode, quota_res, i_sectors_delta);
 
-       return BTREE_INSERT_OK;
+       if (total_delta)
+               *total_delta += i_sectors_delta;
+err:
+       if (!IS_ERR_OR_NULL(inode_iter))
+               bch2_trans_iter_put(trans, inode_iter);
+       return ret;
 }
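
bch2_extent_update() bundles the extent insert and the matching inode update (i_sectors, possibly i_size) into one atomic commit, so the on-disk inode can never disagree with the extents. Because the commit is atomic it may return -EINTR on transaction restart; callers loop and consume partial progress, as bchfs_write_index_update() below does. A condensed sketch of that contract (same names as above, flow simplified):

/* Sketch of the caller-side retry/consume loop, not a new helper: */
do {
	BKEY_PADDED(k) tmp;

	bkey_copy(&tmp.k, k);

	ret = bch2_extent_update(&trans, inode, disk_res, quota_res,
				 iter, &tmp.k, new_i_size,
				 true,		/* may_allocate */
				 false,		/* direct */
				 NULL);
	if (ret && ret != -EINTR)
		break;

	if (bkey_cmp(iter->pos, k->k.p) < 0)
		bch2_cut_front(iter->pos, k);	/* partial progress */
	else
		break;				/* key fully inserted */
} while (1);
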
 
 static int bchfs_write_index_update(struct bch_write_op *wop)
 {
        struct bchfs_write_op *op = container_of(wop,
                                struct bchfs_write_op, op);
+       struct quota_res *quota_res = op->is_dio
+               ? &container_of(op, struct dio_write, iop)->quota_res
+               : NULL;
+       struct bch_inode_info *inode = op->inode;
        struct keylist *keys = &op->op.insert_keys;
-       struct btree_trans trans;
-       struct btree_iter *extent_iter, *inode_iter = NULL;
-       struct bchfs_extent_trans_hook hook;
        struct bkey_i *k = bch2_keylist_front(keys);
-       s64 orig_sectors_added = op->sectors_added;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        int ret;
 
-       BUG_ON(k->k.p.inode != op->inode->v.i_ino);
+       BUG_ON(k->k.p.inode != inode->v.i_ino);
 
        bch2_trans_init(&trans, wop->c);
+       bch2_trans_preload_iters(&trans);
 
-       extent_iter = bch2_trans_get_iter(&trans,
+       iter = bch2_trans_get_iter(&trans,
                                BTREE_ID_EXTENTS,
-                               bkey_start_pos(&bch2_keylist_front(keys)->k),
+                               bkey_start_pos(&k->k),
                                BTREE_ITER_INTENT);
-       BUG_ON(IS_ERR(extent_iter));
-
-       hook.op                 = op;
-       hook.hook.fn            = bchfs_extent_update_hook;
-       hook.need_inode_update  = false;
 
        do {
-               /* XXX: inode->i_size locking */
-               k = bch2_keylist_front(keys);
-               if (min(k->k.p.offset << 9, op->new_i_size) >
-                   op->inode->ei_inode.bi_size)
-                       hook.need_inode_update = true;
-
-               /* optimization for fewer transaction restarts: */
-               ret = bch2_btree_iter_traverse(extent_iter);
-               if (ret)
-                       goto err;
-
-               if (hook.need_inode_update) {
-                       struct bkey_s_c inode;
-
-                       if (!inode_iter) {
-                               inode_iter = bch2_trans_get_iter(&trans,
-                                       BTREE_ID_INODES,
-                                       POS(extent_iter->pos.inode, 0),
-                                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-                               BUG_ON(IS_ERR(inode_iter));
-                       }
-
-                       inode = bch2_btree_iter_peek_slot(inode_iter);
-                       if ((ret = btree_iter_err(inode)))
-                               goto err;
-
-                       if (WARN_ONCE(inode.k->type != BCH_INODE_FS,
-                                     "inode %llu not found when updating",
-                                     extent_iter->pos.inode)) {
-                               ret = -ENOENT;
-                               break;
-                       }
-
-                       if (WARN_ONCE(bkey_bytes(inode.k) >
-                                     sizeof(hook.inode_p),
-                                     "inode %llu too big (%zu bytes, buf %zu)",
-                                     extent_iter->pos.inode,
-                                     bkey_bytes(inode.k),
-                                     sizeof(hook.inode_p))) {
-                               ret = -ENOENT;
-                               break;
-                       }
-
-                       bkey_reassemble(&hook.inode_p.inode.k_i, inode);
-                       ret = bch2_inode_unpack(bkey_s_c_to_inode(inode),
-                                              &hook.inode_u);
-                       if (WARN_ONCE(ret,
-                                     "error %i unpacking inode %llu",
-                                     ret, extent_iter->pos.inode)) {
-                               ret = -ENOENT;
-                               break;
-                       }
-
-                       ret = bch2_btree_insert_at(wop->c, &wop->res,
-                                       &hook.hook, op_journal_seq(wop),
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_ATOMIC|
-                                       BTREE_INSERT_USE_RESERVE,
-                                       BTREE_INSERT_ENTRY(extent_iter, k),
-                                       BTREE_INSERT_ENTRY_EXTRA_RES(inode_iter,
-                                                       &hook.inode_p.inode.k_i, 2));
-               } else {
-                       ret = bch2_btree_insert_at(wop->c, &wop->res,
-                                       &hook.hook, op_journal_seq(wop),
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_ATOMIC|
-                                       BTREE_INSERT_USE_RESERVE,
-                                       BTREE_INSERT_ENTRY(extent_iter, k));
-               }
+               BKEY_PADDED(k) tmp;
 
-               BUG_ON(bkey_cmp(extent_iter->pos, bkey_start_pos(&k->k)));
+               bkey_copy(&tmp.k, bch2_keylist_front(keys));
 
-               if (WARN_ONCE(!ret != !k->k.size,
-                             "ret %i k->size %u", ret, k->k.size))
-                       ret = k->k.size ? -EINTR : 0;
-err:
+               ret = bch2_extent_update(&trans, inode,
+                               &wop->res, quota_res,
+                               iter, &tmp.k,
+                               op->new_i_size,
+                               !op->unalloc,
+                               op->is_dio,
+                               &op->sectors_added);
                if (ret == -EINTR)
                        continue;
                if (ret)
                        break;
 
-               BUG_ON(bkey_cmp(extent_iter->pos, k->k.p) < 0);
-               bch2_keylist_pop_front(keys);
+               if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
+                       bch2_cut_front(iter->pos, bch2_keylist_front(keys));
+               else
+                       bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
        bch2_trans_exit(&trans);
 
-       if (op->is_dio) {
-               struct dio_write *dio = container_of(op, struct dio_write, iop);
-
-               i_sectors_acct(wop->c, op->inode, &dio->quota_res,
-                              op->sectors_added - orig_sectors_added);
-       }
-
        return ret;
 }
 
@@ -828,17 +733,6 @@ static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
                bio_end_sector(bio) == offset;
 }
 
-static void __bio_add_page(struct bio *bio, struct page *page)
-{
-       bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) {
-               .bv_page = page,
-               .bv_len = PAGE_SIZE,
-               .bv_offset = 0,
-       };
-
-       bio->bi_iter.bi_size += PAGE_SIZE;
-}
-
 static int bio_add_page_contig(struct bio *bio, struct page *page)
 {
        sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
@@ -850,7 +744,7 @@ static int bio_add_page_contig(struct bio *bio, struct page *page)
        else if (!bio_can_add_page_contig(bio, page))
                return -1;
 
-       __bio_add_page(bio, page);
+       __bio_add_page(bio, page, PAGE_SIZE, 0);
        return 0;
 }
 
@@ -974,7 +868,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
                        iter->nr_pages--;
                } else if (get_more) {
                        rcu_read_lock();
-                       page = radix_tree_lookup(&iter->mapping->page_tree, page_offset);
+                       page = radix_tree_lookup(&iter->mapping->i_pages, page_offset);
                        rcu_read_unlock();
 
                        if (page && !radix_tree_exceptional_entry(page))
@@ -994,7 +888,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
                if (ret)
                        break;
 
-               __bio_add_page(bio, page);
+               __bio_add_page(bio, page, PAGE_SIZE, 0);
        }
 
        if (!iter->nr_pages)
@@ -2068,7 +1962,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret;
+       int ret, ret2;
 
        ret = file_write_and_wait_range(file, start, end);
        if (ret)
@@ -2084,11 +1978,63 @@ out:
        if (c->opts.journal_flush_disabled)
                return 0;
 
-       return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
+       ret = bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
+       ret2 = file_check_and_advance_wb_err(file);
+
+       return ret ?: ret2;
 }
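
`ret ?: ret2` is the GNU C conditional with omitted middle operand: it evaluates to ret when ret is non-zero, otherwise ret2, so a journal-flush failure is reported in preference to a stashed writeback error. Spelled out:

/* Equivalent to the 'ret ?: ret2' above: */
if (ret)
	return ret;
return ret2;
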
 
 /* truncate: */
 
+static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
+                        u64 start_offset, u64 end_offset, u64 *journal_seq)
+{
+       struct bpos start       = POS(inode->v.i_ino, start_offset);
+       struct bpos end         = POS(inode->v.i_ino, end_offset);
+       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c);
+       bch2_trans_preload_iters(&trans);
+
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+                                  BTREE_ITER_INTENT);
+
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = btree_iter_err(k)) &&
+              bkey_cmp(iter->pos, end) < 0) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct bkey_i delete;
+
+               bkey_init(&delete.k);
+               delete.k.p = iter->pos;
+
+               /* create the biggest key we can */
+               bch2_key_resize(&delete.k, max_sectors);
+               bch2_cut_back(end, &delete.k);
+
+               ret = bch2_extent_update(&trans, inode,
+                               &disk_res, NULL, iter, &delete,
+                               0, true, true, NULL);
+               bch2_disk_reservation_put(c, &disk_res);
+
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       break;
+
+               bch2_btree_iter_cond_resched(iter);
+       }
+
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
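
max_sectors above is KEY_SIZE_MAX rounded down to a whole number of filesystem blocks: `~0 << c->block_bits` is a mask with the low block_bits cleared, so each delete key covers as many sectors as a key can hold without splitting a block. A hedged arithmetic sketch (example block size assumed):

/*
 * Sketch: with 512-byte sectors and 4096-byte blocks there are
 * 8 sectors per block, so block_bits == 3 and the delete keys are
 * sized to a multiple of 8 sectors:
 */
unsigned block_bits  = 3;	/* assumed: ilog2(4096 >> 9) */
unsigned max_sectors = KEY_SIZE_MAX & (~0 << block_bits);
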
 static inline int range_has_data(struct bch_fs *c,
                                  struct bpos start,
                                  struct bpos end)
@@ -2203,19 +2149,39 @@ static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
        setattr_copy(&inode->v, iattr);
 
        mutex_lock(&inode->ei_update_lock);
-       inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
-       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+       ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+                                   ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
        return ret;
 }
 
+static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
+                                  struct bch_inode_unpacked *bi,
+                                  void *p)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+       bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
+       return 0;
+}
+
+static int bch2_truncate_start_fn(struct bch_inode_info *inode,
+                                 struct bch_inode_unpacked *bi, void *p)
+{
+       u64 *new_i_size = p;
+
+       bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
+       bi->bi_size = *new_i_size;
+       return 0;
+}
+
 int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct i_sectors_hook i_sectors_hook =
-               i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
+       u64 new_i_size = iattr->ia_size;
        bool shrink;
        int ret = 0;
 
@@ -2228,12 +2194,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 
        if (!shrink) {
                ret = bch2_extend(inode, iattr);
-               goto err_put_pagecache;
+               goto err;
        }
 
        ret = bch2_truncate_page(inode, iattr->ia_size);
        if (unlikely(ret))
-               goto err_put_pagecache;
+               goto err;
 
        if (iattr->ia_size > inode->ei_inode.bi_size)
                ret = filemap_write_and_wait_range(mapping,
@@ -2244,37 +2210,37 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
                                round_down(iattr->ia_size, PAGE_SIZE),
                                iattr->ia_size - 1);
        if (ret)
-               goto err_put_pagecache;
+               goto err;
 
-       i_sectors_hook.new_i_size = iattr->ia_size;
+       mutex_lock(&inode->ei_update_lock);
+       ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
+                              &new_i_size, 0);
+       mutex_unlock(&inode->ei_update_lock);
 
-       ret = i_sectors_dirty_start(c, &i_sectors_hook);
        if (unlikely(ret))
-               goto err_put_pagecache;
+               goto err;
 
        truncate_setsize(&inode->v, iattr->ia_size);
 
-       ret = bch2_inode_truncate(c, inode->v.i_ino,
-                                 round_up(iattr->ia_size, PAGE_SIZE) >> 9,
-                                 &i_sectors_hook.hook,
-                                 &inode->ei_journal_seq);
+       /*
+        * XXX: need a comment explaining why PAGE_SIZE and not block_bytes()
+        * here:
+        */
+       ret = __bch2_fpunch(c, inode,
+                       round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+                       U64_MAX, &inode->ei_journal_seq);
        if (unlikely(ret))
-               goto err_put_sectors_dirty;
+               goto err;
 
        setattr_copy(&inode->v, iattr);
-       inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
-out:
-       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
-err_put_pagecache:
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
+                              ATTR_MTIME|ATTR_CTIME);
+       mutex_unlock(&inode->ei_update_lock);
+err:
        pagecache_block_put(&mapping->add_lock);
        return ret;
-err_put_sectors_dirty:
-       /*
-        * On error - in particular, bch2_truncate_page() error - don't clear
-        * I_SIZE_DIRTY, as we've left data above i_size!:
-        */
-       i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
-       goto out;
 }
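
The order of operations in the shrink path is deliberate: bch2_truncate_start_fn() persists the new size with BCH_INODE_I_SIZE_DIRTY set before any extents are punched, and bch2_truncate_finish_fn() clears the flag only after __bch2_fpunch() succeeds, so a crash mid-truncate leaves a durable marker that extents past i_size may remain. A hedged sketch of the recovery-side rule this implies (truncate_extents_past() is a hypothetical helper, not the fsck API):

/* Hypothetical fsck-side consequence of the flag protocol above: */
if (bi->bi_flags & BCH_INODE_I_SIZE_DIRTY)
	/* an interrupted truncate may have left extents past i_size */
	truncate_extents_past(bi->bi_inum, bi->bi_size);
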
 
 /* fallocate: */
@@ -2283,7 +2249,6 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       u64 ino = inode->v.i_ino;
        u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
        u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
        int ret = 0;
@@ -2309,34 +2274,9 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 
        truncate_pagecache_range(&inode->v, offset, offset + len - 1);
 
-       if (discard_start < discard_end) {
-               /*
-                * We need to pass in a disk reservation here because we might
-                * be splitting a compressed extent into two. This isn't a
-                * problem with truncate because truncate will never split an
-                * extent, only truncate it...
-                */
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct i_sectors_hook i_sectors_hook =
-                       i_sectors_hook_init(inode, 0);
-               int ret;
-
-               ret = i_sectors_dirty_start(c, &i_sectors_hook);
-               if (unlikely(ret))
-                       goto err;
-
-               ret = bch2_btree_delete_range(c,
-                               BTREE_ID_EXTENTS,
-                               POS(ino, discard_start),
-                               POS(ino, discard_end),
-                               ZERO_VERSION,
-                               &disk_res,
-                               &i_sectors_hook.hook,
-                               &inode->ei_journal_seq);
-
-               ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
-       }
+       if (discard_start < discard_end)
+               ret = __bch2_fpunch(c, inode, discard_start, discard_end,
+                                   &inode->ei_journal_seq);
 err:
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
@@ -2353,7 +2293,6 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        struct btree_iter *src, *dst;
        BKEY_PADDED(k) copy;
        struct bkey_s_c k;
-       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
        loff_t new_size;
        int ret;
 
@@ -2361,16 +2300,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                return -EINVAL;
 
        bch2_trans_init(&trans, c);
-
-       dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-                            POS(inode->v.i_ino, offset >> 9),
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       BUG_ON(IS_ERR(dst));
-
-       /* position will be set from dst iter's position: */
-       src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
-                            BTREE_ITER_SLOTS);
-       BUG_ON(IS_ERR(src));
+       bch2_trans_preload_iters(&trans);
 
        /*
         * We need i_mutex to keep the page cache consistent with the extents
@@ -2395,15 +2325,24 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        if (ret)
                goto err;
 
-       ret = i_sectors_dirty_start(c, &i_sectors_hook);
-       if (ret)
-               goto err;
+       dst = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                       POS(inode->v.i_ino, offset >> 9),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       BUG_ON(IS_ERR_OR_NULL(dst));
+
+       src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                       POS_MIN, BTREE_ITER_SLOTS);
+       BUG_ON(IS_ERR_OR_NULL(src));
 
        while (bkey_cmp(dst->pos,
                        POS(inode->v.i_ino,
                            round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
                struct disk_reservation disk_res;
 
+               ret = bch2_btree_iter_traverse(dst);
+               if (ret)
+                       goto btree_iter_err;
+
                bch2_btree_iter_set_pos(src,
                        POS(dst->pos.inode, dst->pos.offset + (len >> 9)));
 
@@ -2416,6 +2355,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                bch2_cut_front(src->pos, &copy.k);
                copy.k.k.p.offset -= len >> 9;
 
+               bch2_extent_trim_atomic(&copy.k, dst);
+
                BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
 
                ret = bch2_disk_reservation_get(c, &disk_res, copy.k.k.size,
@@ -2423,19 +2364,16 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                                BCH_DISK_RESERVATION_NOFAIL);
                BUG_ON(ret);
 
-               ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
-                                          &inode->ei_journal_seq,
-                                          BTREE_INSERT_ATOMIC|
-                                          BTREE_INSERT_NOFAIL,
-                                          BTREE_INSERT_ENTRY(dst, &copy.k));
+               ret = bch2_extent_update(&trans, inode,
+                               &disk_res, NULL,
+                               dst, &copy.k,
+                               0, true, true, NULL);
                bch2_disk_reservation_put(c, &disk_res);
 btree_iter_err:
                if (ret == -EINTR)
                        ret = 0;
-               if (ret) {
-                       bch2_trans_exit(&trans);
-                       goto err_put_sectors_dirty;
-               }
+               if (ret)
+                       goto err;
                /*
                 * XXX: if we error here we've left data with multiple
                 * pointers... which isn't a _super_ serious problem...
@@ -2443,20 +2381,21 @@ btree_iter_err:
 
                bch2_btree_iter_cond_resched(src);
        }
+       bch2_trans_unlock(&trans);
 
-       bch2_trans_exit(&trans);
-
-       ret = bch2_inode_truncate(c, inode->v.i_ino,
-                                round_up(new_size, block_bytes(c)) >> 9,
-                                &i_sectors_hook.hook,
-                                &inode->ei_journal_seq);
+       ret = __bch2_fpunch(c, inode,
+                       round_up(new_size, block_bytes(c)) >> 9,
+                       U64_MAX, &inode->ei_journal_seq);
        if (ret)
-               goto err_put_sectors_dirty;
+               goto err;
 
-       i_sectors_hook.new_i_size = new_size;
-err_put_sectors_dirty:
-       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+       i_size_write(&inode->v, new_size);
+       mutex_lock(&inode->ei_update_lock);
+       ret = bch2_write_inode_size(c, inode, new_size,
+                                   ATTR_MTIME|ATTR_CTIME);
+       mutex_unlock(&inode->ei_update_lock);
 err:
+       bch2_trans_exit(&trans);
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
        return ret;
@@ -2467,8 +2406,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 {
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
-       struct btree_iter iter;
+       struct btree_trans trans;
+       struct btree_iter *iter;
        struct bpos end_pos;
        loff_t block_start, block_end;
        loff_t end = offset + len;
@@ -2476,8 +2415,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
        unsigned replicas = io_opts(c, inode).data_replicas;
        int ret;
 
-       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_init(&trans, c);
+       bch2_trans_preload_iters(&trans);
 
        inode_lock(&inode->v);
        inode_dio_wait(&inode->v);
@@ -2512,34 +2451,32 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                block_end       = round_up(end, PAGE_SIZE);
        }
 
-       bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                       POS(inode->v.i_ino, block_start >> 9),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        end_pos = POS(inode->v.i_ino, block_end >> 9);
 
-       ret = i_sectors_dirty_start(c, &i_sectors_hook);
-       if (unlikely(ret))
-               goto err;
-
-       while (bkey_cmp(iter.pos, end_pos) < 0) {
+       while (bkey_cmp(iter->pos, end_pos) < 0) {
                struct disk_reservation disk_res = { 0 };
+               struct quota_res quota_res = { 0 };
                struct bkey_i_reservation reservation;
                struct bkey_s_c k;
 
-               k = bch2_btree_iter_peek_slot(&iter);
+               k = bch2_btree_iter_peek_slot(iter);
                if ((ret = btree_iter_err(k)))
                        goto btree_iter_err;
 
                /* already reserved */
                if (k.k->type == BCH_RESERVATION &&
                    bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
-                       bch2_btree_iter_next_slot(&iter);
+                       bch2_btree_iter_next_slot(iter);
                        continue;
                }
 
-               if (bkey_extent_is_data(k.k)) {
-                       if (!(mode & FALLOC_FL_ZERO_RANGE)) {
-                               bch2_btree_iter_next_slot(&iter);
-                               continue;
-                       }
+               if (bkey_extent_is_data(k.k) &&
+                   !(mode & FALLOC_FL_ZERO_RANGE)) {
+                       bch2_btree_iter_next_slot(iter);
+                       continue;
                }
 
                bkey_reservation_init(&reservation.k_i);
@@ -2547,7 +2484,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                reservation.k.p         = k.k->p;
                reservation.k.size      = k.k->size;
 
-               bch2_cut_front(iter.pos, &reservation.k_i);
+               bch2_cut_front(iter->pos, &reservation.k_i);
                bch2_cut_back(end_pos, &reservation.k);
 
                sectors = reservation.k.size;
@@ -2555,7 +2492,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 
                if (!bkey_extent_is_allocation(k.k)) {
                        ret = bch2_quota_reservation_add(c, inode,
-                                       &i_sectors_hook.quota_res,
+                                       &quota_res,
                                        sectors, true);
                        if (unlikely(ret))
                                goto btree_iter_err;
@@ -2571,31 +2508,27 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                        reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
-               ret = bch2_btree_insert_at(c, &disk_res, &i_sectors_hook.hook,
-                                         &inode->ei_journal_seq,
-                                         BTREE_INSERT_ATOMIC|
-                                         BTREE_INSERT_NOFAIL,
-                                         BTREE_INSERT_ENTRY(&iter, &reservation.k_i));
+               ret = bch2_extent_update(&trans, inode,
+                               &disk_res, &quota_res,
+                               iter, &reservation.k_i,
+                               0, true, true, NULL);
+
+               bch2_quota_reservation_put(c, inode, &quota_res);
                bch2_disk_reservation_put(c, &disk_res);
 btree_iter_err:
                if (ret == -EINTR)
                        ret = 0;
-               if (ret) {
-                       bch2_btree_iter_unlock(&iter);
-                       goto err_put_sectors_dirty;
-               }
-
+               if (ret)
+                       goto err;
        }
-       bch2_btree_iter_unlock(&iter);
-
-       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+       bch2_trans_unlock(&trans);
 
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
            end > inode->v.i_size) {
                i_size_write(&inode->v, end);
 
                mutex_lock(&inode->ei_update_lock);
-               ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+               ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0);
                mutex_unlock(&inode->ei_update_lock);
        }
 
@@ -2611,18 +2544,13 @@ btree_iter_err:
 
                if (inode->ei_inode.bi_size != inode->v.i_size) {
                        mutex_lock(&inode->ei_update_lock);
-                       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+                       ret = bch2_write_inode_size(c, inode,
+                                                   inode->v.i_size, 0);
                        mutex_unlock(&inode->ei_update_lock);
                }
        }
-
-       pagecache_block_put(&mapping->add_lock);
-       inode_unlock(&inode->v);
-
-       return 0;
-err_put_sectors_dirty:
-       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 err:
+       bch2_trans_exit(&trans);
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
        return ret;
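
Note the reservation discipline in the fallocate loop above: both reservations are taken per iteration and unconditionally released after the update, since bch2_extent_update() accounts whatever it actually consumed on success. Condensed sketch (disk reservation acquisition elided; names as above):

struct disk_reservation disk_res = { 0 };
struct quota_res quota_res = { 0 };

ret = bch2_quota_reservation_add(c, inode, &quota_res, sectors, true);
if (!ret)
	ret = bch2_extent_update(&trans, inode, &disk_res, &quota_res,
				 iter, &reservation.k_i,
				 0, true, true, NULL);

/* always release; on success the update already took what it needed */
bch2_quota_reservation_put(c, inode, &quota_res);
bch2_disk_reservation_put(c, &disk_res);
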
index 336dbd4ba8d6b8554328e6c6f01ff9754978779c..0eb0a0112a84f8c8a4daf7e5d42a4eba46acb924 100644 (file)
 
 #define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
 
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
-       [__BCH_INODE_SYNC]      = S_SYNC,
-       [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
-       [__BCH_INODE_APPEND]    = S_APPEND,
-       [__BCH_INODE_NOATIME]   = S_NOATIME,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
-       [__BCH_INODE_SYNC]      = FS_SYNC_FL,
-       [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
-       [__BCH_INODE_APPEND]    = FS_APPEND_FL,
-       [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
-       [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
-       [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
-       [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
-       [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
-       [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
-       [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
-       //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
-#define set_flags(_map, _in, _out)                                     \
-do {                                                                   \
-       unsigned _i;                                                    \
-                                                                       \
-       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
-               if ((_in) & (1 << _i))                                  \
-                       (_out) |= _map[_i];                             \
-               else                                                    \
-                       (_out) &= ~_map[_i];                            \
-} while (0)
-
-#define map_flags(_map, _in)                                           \
-({                                                                     \
-       unsigned _out = 0;                                              \
-                                                                       \
-       set_flags(_map, _in, _out);                                     \
-       _out;                                                           \
-})
-
-#define map_flags_rev(_map, _in)                                       \
-({                                                                     \
-       unsigned _i, _out = 0;                                          \
-                                                                       \
-       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
-               if ((_in) & _map[_i]) {                                 \
-                       (_out) |= 1 << _i;                              \
-                       (_in) &= ~_map[_i];                             \
-               }                                                       \
-       (_out);                                                         \
-})
-
-#define map_defined(_map)                                              \
-({                                                                     \
-       unsigned _in = ~0;                                              \
-                                                                       \
-       map_flags_rev(_map, _in);                                       \
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
-       set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
-
 struct flags_set {
        unsigned                mask;
        unsigned                flags;
@@ -95,6 +22,7 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi,
                                void *p)
 {
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
        /*
         * We're relying on btree locking here for exclusion with other ioctl
         * calls - use the flags in the btree (@bi), not inode->i_flags:
@@ -107,14 +35,15 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;
 
-       if (!S_ISREG(inode->v.i_mode) &&
-           !S_ISDIR(inode->v.i_mode) &&
+       if (!S_ISREG(bi->bi_mode) &&
+           !S_ISDIR(bi->bi_mode) &&
            (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
                return -EINVAL;
 
        bi->bi_flags &= ~s->mask;
        bi->bi_flags |= newflags;
-       inode->v.i_ctime = current_time(&inode->v);
+
+       bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
        return 0;
 }
 
@@ -152,10 +81,8 @@ static int bch2_ioc_setflags(struct bch_fs *c,
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
-
-       if (!ret)
-               bch2_inode_flags_to_vfs(inode);
+       ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
+                              ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
 setflags_out:
@@ -241,9 +168,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c,
        if (ret)
                goto err_unlock;
 
-       ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
-       if (!ret)
-               bch2_inode_flags_to_vfs(inode);
+       ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+                              ATTR_CTIME);
 err_unlock:
        mutex_unlock(&inode->ei_update_lock);
 err:
index c14e583da7ec6ab2cb355afc18463ae44811f245..c7124ed3b620356e19ee84d7889fdfc30fbce170 100644 (file)
@@ -1,7 +1,78 @@
 #ifndef _BCACHEFS_FS_IOCTL_H
 #define _BCACHEFS_FS_IOCTL_H
 
-void bch2_inode_flags_to_vfs(struct bch_inode_info *);
+/* Inode flags: */
+
+/* bcachefs inode flags -> vfs inode flags: */
+static const unsigned bch_flags_to_vfs[] = {
+       [__BCH_INODE_SYNC]      = S_SYNC,
+       [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
+       [__BCH_INODE_APPEND]    = S_APPEND,
+       [__BCH_INODE_NOATIME]   = S_NOATIME,
+};
+
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const unsigned bch_flags_to_uflags[] = {
+       [__BCH_INODE_SYNC]      = FS_SYNC_FL,
+       [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
+       [__BCH_INODE_APPEND]    = FS_APPEND_FL,
+       [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
+       [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const unsigned bch_flags_to_xflags[] = {
+       [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
+       [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
+       [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
+       [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
+       [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
+       //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT,
+};
+
+#define set_flags(_map, _in, _out)                                     \
+do {                                                                   \
+       unsigned _i;                                                    \
+                                                                       \
+       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
+               if ((_in) & (1 << _i))                                  \
+                       (_out) |= _map[_i];                             \
+               else                                                    \
+                       (_out) &= ~_map[_i];                            \
+} while (0)
+
+#define map_flags(_map, _in)                                           \
+({                                                                     \
+       unsigned _out = 0;                                              \
+                                                                       \
+       set_flags(_map, _in, _out);                                     \
+       _out;                                                           \
+})
+
+#define map_flags_rev(_map, _in)                                       \
+({                                                                     \
+       unsigned _i, _out = 0;                                          \
+                                                                       \
+       for (_i = 0; _i < ARRAY_SIZE(_map); _i++)                       \
+               if ((_in) & _map[_i]) {                                 \
+                       (_out) |= 1 << _i;                              \
+                       (_in) &= ~_map[_i];                             \
+               }                                                       \
+       (_out);                                                         \
+})
+
+#define map_defined(_map)                                              \
+({                                                                     \
+       unsigned _in = ~0;                                              \
+                                                                       \
+       map_flags_rev(_map, _in);                                       \
+})
+
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
+{
+       set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+}
 
 long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
 long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
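
For reference, map_flags() projects bcachefs flag bits into another namespace via the tables above, while map_flags_rev() maps back and strips every bit it recognizes from its input, letting callers detect unsupported flags. A minimal usage sketch (the ioctl argument is hypothetical):

/* bcachefs inode flags -> FS_IOC_GETFLAGS bits: */
unsigned uflags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);

/* FS_IOC_SETFLAGS bits -> bcachefs flags; whatever survives in
 * 'in' is a flag we don't support, so reject the ioctl: */
unsigned in = arg;	/* hypothetical FS_IOC_SETFLAGS argument */
unsigned newflags = map_flags_rev(bch_flags_to_uflags, in);

if (in)
	return -EOPNOTSUPP;
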
index c51a65da0fb38fc79d285913ee32c9149c6815e4..ae875870b78ddf0b638f33a62d671a49d36e516b 100644 (file)
@@ -47,6 +47,30 @@ static void journal_seq_copy(struct bch_inode_info *dst,
        } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
 }
 
+static inline int ptrcmp(void *l, void *r)
+{
+       return (l > r) - (l < r);
+}
+
+#define __bch2_lock_inodes(_lock, ...)                                 \
+do {                                                                   \
+       struct bch_inode_info *a[] = { NULL, __VA_ARGS__ };             \
+       unsigned i;                                                     \
+                                                                       \
+       bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp);                  \
+                                                                       \
+       for (i = ARRAY_SIZE(a) - 1; a[i]; --i)                          \
+               if (a[i] != a[i - 1]) {                                 \
+                       if (_lock)                                      \
+                               mutex_lock_nested(&a[i]->ei_update_lock, i);\
+                       else                                            \
+                               mutex_unlock(&a[i]->ei_update_lock);    \
+               }                                                       \
+} while (0)
+
+#define bch2_lock_inodes(...)  __bch2_lock_inodes(true, __VA_ARGS__)
+#define bch2_unlock_inodes(...)        __bch2_lock_inodes(false, __VA_ARGS__)
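+
+bch2_lock_inodes() sorts its arguments by address before locking, so every call site acquires ei_update_lock in one global order and ABBA deadlock between concurrent link/unlink/rename paths is ruled out; the leading NULL plus the `a[i] != a[i - 1]` test skip duplicate inodes, and mutex_lock_nested(..., i) gives lockdep a distinct subclass per position. A minimal sketch of the property (toy call sites, assuming the macros above):
+
+/* Both threads take the mutexes in address order, so neither can
+ * hold one lock while waiting for the other in reverse order: */
+static void thread_a(struct bch_inode_info *x, struct bch_inode_info *y)
+{
+	bch2_lock_inodes(x, y);
+	/* ...modify both inodes... */
+	bch2_unlock_inodes(x, y);
+}
+
+static void thread_b(struct bch_inode_info *x, struct bch_inode_info *y)
+{
+	bch2_lock_inodes(y, x);		/* same internal lock order */
+	/* ... */
+	bch2_unlock_inodes(y, x);
+}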
+
 /*
  * I_SIZE_DIRTY requires special handling:
  *
@@ -96,6 +120,8 @@ void bch2_inode_update_after_write(struct bch_fs *c,
 
        inode->ei_inode         = *bi;
        inode->ei_qid           = bch_qid(bi);
+
+       bch2_inode_flags_to_vfs(inode);
 }
 
 int __must_check bch2_write_inode_trans(struct btree_trans *trans,
@@ -106,35 +132,22 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
 {
        struct btree_iter *iter;
        struct bkey_inode_buf *inode_p;
-       struct bkey_s_c k;
-       u64 inum = inode->v.i_ino;
        int ret;
 
        lockdep_assert_held(&inode->ei_update_lock);
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inum, 0),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
+                       POS(inode->v.i_ino, 0),
+                       BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        if (IS_ERR(iter))
                return PTR_ERR(iter);
 
-       k = bch2_btree_iter_peek_slot(iter);
-       if ((ret = btree_iter_err(k)))
+       /* The btree node lock is our lock on the inode: */
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
                return ret;
 
-       if (WARN_ONCE(k.k->type != BCH_INODE_FS,
-                     "inode %llu not found when updating", inum))
-               return -ENOENT;
-
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode_u);
-       if (WARN_ONCE(ret,
-                     "error %i unpacking inode %llu", ret, inum))
-               return -ENOENT;
-
-       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size);
-
-       BUG_ON(inode_u->bi_size != inode->ei_inode.bi_size &&
-              !(inode_u->bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-              inode_u->bi_size > i_size_read(&inode->v));
+       *inode_u = inode->ei_inode;
 
        if (set) {
                ret = set(inode, inode_u, p);
@@ -147,14 +160,14 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans,
                return PTR_ERR(inode_p);
 
        bch2_inode_pack(inode_p, inode_u);
-       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
        return 0;
 }
 
-int __must_check __bch2_write_inode(struct bch_fs *c,
-                                   struct bch_inode_info *inode,
-                                   inode_set_fn set,
-                                   void *p, unsigned fields)
+int __must_check bch2_write_inode(struct bch_fs *c,
+                                 struct bch_inode_info *inode,
+                                 inode_set_fn set,
+                                 void *p, unsigned fields)
 {
        struct btree_trans trans;
        struct bch_inode_unpacked inode_u;
@@ -165,7 +178,7 @@ retry:
        bch2_trans_begin(&trans);
 
        ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK|
@@ -235,9 +248,8 @@ static int inode_update_for_create_fn(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_inode_unpacked *new_inode = p;
-       struct timespec now = current_time(&inode->v);
 
-       bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
 
        if (S_ISDIR(new_inode->bi_mode))
                bi->bi_nlink++;
@@ -256,6 +268,7 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
        struct bch_inode_unpacked inode_u;
        struct bch_hash_info hash_info;
        struct posix_acl *default_acl = NULL, *acl = NULL;
+       u64 journal_seq = 0;
        int ret;
 
        bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode);
@@ -288,6 +301,9 @@ __bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
                goto err;
        }
 
+       if (!tmpfile)
+               mutex_lock(&dir->ei_update_lock);
+
        bch2_trans_init(&trans, c);
 retry:
        bch2_trans_begin(&trans);
@@ -316,8 +332,8 @@ retry:
                                          inode_update_for_create_fn,
                                          &inode_u)
                 : 0) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
-                                 &inode->ei_journal_seq,
+               bch2_trans_commit(&trans, NULL,
+                                 &journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK);
        if (ret == -EINTR)
@@ -331,9 +347,11 @@ retry:
                bch2_inode_update_after_write(c, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                journal_seq_copy(dir, inode->ei_journal_seq);
+               mutex_unlock(&dir->ei_update_lock);
        }
 
        bch2_vfs_inode_init(c, inode, &inode_u);
+       journal_seq_copy(inode, journal_seq);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -369,6 +387,9 @@ out:
        posix_acl_release(acl);
        return inode;
 err_trans:
+       if (!tmpfile)
+               mutex_unlock(&dir->ei_update_lock);
+
        bch2_trans_exit(&trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
@@ -416,9 +437,8 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode,
                                    void *p)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct timespec now = current_time(&inode->v);
 
-       bi->bi_ctime = timespec_to_bch2_time(c, now);
+       bi->bi_ctime = bch2_current_time(c);
 
        if (bi->bi_flags & BCH_INODE_UNLINKED)
                bi->bi_flags &= ~BCH_INODE_UNLINKED;
@@ -437,8 +457,7 @@ static int __bch2_link(struct bch_fs *c,
        struct bch_inode_unpacked inode_u;
        int ret;
 
-       lockdep_assert_held(&inode->v.i_rwsem);
-
+       mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c);
 retry:
        bch2_trans_begin(&trans);
@@ -452,7 +471,7 @@ retry:
                bch2_write_inode_trans(&trans, inode, &inode_u,
                                       inode_update_for_link_fn,
                                       NULL) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK);
@@ -464,6 +483,7 @@ retry:
                bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
 
        bch2_trans_exit(&trans);
+       mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
 
@@ -475,6 +495,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
        struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
        int ret;
 
+       lockdep_assert_held(&inode->v.i_rwsem);
+
        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                return ret;
@@ -490,9 +512,8 @@ static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_inode_info *unlink_inode = p;
-       struct timespec now = current_time(&inode->v);
 
-       bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now);
+       bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
 
        bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode);
 
@@ -504,9 +525,8 @@ static int inode_update_for_unlink_fn(struct bch_inode_info *inode,
                                      void *p)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct timespec now = current_time(&inode->v);
 
-       bi->bi_ctime = timespec_to_bch2_time(c, now);
+       bi->bi_ctime = bch2_current_time(c);
        if (bi->bi_nlink)
                bi->bi_nlink--;
        else
@@ -524,6 +544,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        struct btree_trans trans;
        int ret;
 
+       bch2_lock_inodes(dir, inode);
        bch2_trans_init(&trans, c);
 retry:
        bch2_trans_begin(&trans);
@@ -537,7 +558,7 @@ retry:
                bch2_write_inode_trans(&trans, inode, &inode_u,
                                       inode_update_for_unlink_fn,
                                       NULL) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &dir->ei_journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK|
@@ -556,6 +577,7 @@ retry:
                                      ATTR_MTIME);
 err:
        bch2_trans_exit(&trans);
+       bch2_unlock_inodes(dir, inode);
 
        return ret;
 }
@@ -683,8 +705,6 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
 {
        struct bch_fs *c = src_vdir->i_sb->s_fs_info;
        struct rename_info i = {
-               .now            = timespec_to_bch2_time(c,
-                                               current_time(src_vdir)),
                .src_dir        = to_bch_ei(src_vdir),
                .dst_dir        = to_bch_ei(dst_vdir),
                .src_inode      = to_bch_ei(src_dentry->d_inode),
@@ -718,10 +738,15 @@ static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
                        return ret;
        }
 
+       bch2_lock_inodes(i.src_dir,
+                        i.dst_dir,
+                        i.src_inode,
+                        i.dst_inode);
+
        bch2_trans_init(&trans, c);
 retry:
        bch2_trans_begin(&trans);
-       i.now = timespec_to_bch2_time(c, current_time(src_vdir)),
+       i.now = bch2_current_time(c);
 
        ret   = bch2_dirent_rename(&trans,
                                   i.src_dir, &src_dentry->d_name,
@@ -739,7 +764,7 @@ retry:
                 ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u,
                                       inode_update_for_rename_fn, &i)
                 : 0 ) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK);
@@ -758,6 +783,10 @@ retry:
                journal_seq_copy(i.dst_dir, journal_seq);
        }
 
+       journal_seq_copy(i.src_inode, journal_seq);
+       if (i.dst_inode)
+               journal_seq_copy(i.dst_inode, journal_seq);
+
        bch2_inode_update_after_write(c, i.src_inode, &src_inode_u,
                                      ATTR_CTIME);
        if (i.dst_inode)
@@ -765,6 +794,10 @@ retry:
                                              ATTR_CTIME);
 err:
        bch2_trans_exit(&trans);
+       bch2_unlock_inodes(i.src_dir,
+                          i.dst_dir,
+                          i.src_inode,
+                          i.dst_inode);
 
        return ret;
 }
@@ -849,7 +882,7 @@ retry:
                (iattr->ia_valid & ATTR_MODE
                 ? bch2_acl_chmod(&trans, inode, iattr->ia_mode, &acl)
                 : 0) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+               bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOUNLOCK|
@@ -1198,8 +1231,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->ei_quota_reserved = 0;
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
 
-       bch2_inode_flags_to_vfs(inode);
-
        inode->v.i_mapping->a_ops = &bch_address_space_operations;
 
        switch (inode->v.i_mode & S_IFMT) {
@@ -1272,8 +1303,8 @@ static int bch2_vfs_write_inode(struct inode *vinode,
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                                ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
+       ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                              ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
        mutex_unlock(&inode->ei_update_lock);
 
        if (c->opts.journal_flush_disabled)
@@ -1312,13 +1343,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
        struct bch_fs *c = sb->s_fs_info;
+       struct bch_fs_usage usage = bch2_fs_usage_read(c);
+       u64 hidden_metadata = usage.buckets[BCH_DATA_SB] +
+               usage.buckets[BCH_DATA_JOURNAL];
+       unsigned shift = sb->s_blocksize_bits - 9;
        u64 fsid;
 
        buf->f_type     = BCACHEFS_STATFS_MAGIC;
        buf->f_bsize    = sb->s_blocksize;
-       buf->f_blocks   = c->capacity >> PAGE_SECTOR_SHIFT;
-       buf->f_bfree    = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)) >>
-                          PAGE_SECTOR_SHIFT;
+       buf->f_blocks   = (c->capacity - hidden_metadata) >> shift;
+       buf->f_bfree    = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift;
        buf->f_bavail   = buf->f_bfree;
        buf->f_files    = atomic_long_read(&c->nr_inodes);
        buf->f_ffree    = U64_MAX;
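For reference, the unit conversion behind the new f_blocks/f_bfree math above — a minimal standalone sketch, with invented names, not part of the patch:

#include <stdint.h>

/* statfs blocks are (1 << s_blocksize_bits) bytes; bcachefs accounts
 * capacity in 512-byte sectors, hence shift = s_blocksize_bits - 9.
 * With 4096-byte blocks: shift = 12 - 9 = 3. */
static uint64_t sectors_to_statfs_blocks(uint64_t sectors,
                                         unsigned blocksize_bits)
{
        return sectors >> (blocksize_bits - 9);
}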
index e2fc2706da44b31b8eca0806b9f6b082433acb9d..a434c757e526f35d3893b083fdc77be98432e6cf 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _BCACHEFS_FS_H
 #define _BCACHEFS_FS_H
 
+#include "inode.h"
 #include "opts.h"
 #include "str_hash.h"
 #include "quota_types.h"
@@ -43,6 +44,11 @@ static inline unsigned nlink_bias(umode_t mode)
        return S_ISDIR(mode) ? 2 : 1;
 }
 
+static inline u64 bch2_current_time(struct bch_fs *c)
+{
+       return timespec_to_bch2_time(c, current_kernel_time64());
+}
+
 struct bch_inode_unpacked;
 
 #ifndef NO_BCACHEFS_FS
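bch2_current_time() gives every timestamp update in fs.c a single source: the filesystem clock read once per operation, instead of current_time() on each inode. A rough userspace analogue of what the helper computes, assuming nanosecond precision and a zero time base (both assumptions for the sketch, not the kernel code):

#include <stdint.h>
#include <time.h>

/* Sketch only: read the realtime clock and flatten it to a single u64 of
 * nanoseconds, the degenerate case of the precision/base encoding used by
 * the conversion helpers in inode.h. */
static uint64_t current_time_sketch(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_REALTIME, &ts);
        return (uint64_t) ts.tv_sec * 1000000000ULL + (uint64_t) ts.tv_nsec;
}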
@@ -59,10 +65,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *,
                                struct bch_inode_info *,
                                struct bch_inode_unpacked *,
                                inode_set_fn, void *);
-int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
-                                   inode_set_fn, void *, unsigned);
-int __must_check bch2_write_inode(struct bch_fs *,
-                                 struct bch_inode_info *);
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
+                                 inode_set_fn, void *, unsigned);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
index f6035cc7859a2568e1831d4da43fa5a5844df443..b3e247afe8558e07d3e4550f099b8ad1db41b01e 100644 (file)
@@ -72,8 +72,7 @@ static int reattach_inode(struct bch_fs *c,
        bch2_inode_pack(&packed, lostfound_inode);
 
        ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
-                              NULL, NULL, NULL,
-                              BTREE_INSERT_NOFAIL);
+                               NULL, NULL, BTREE_INSERT_NOFAIL);
        if (ret) {
                bch_err(c, "error %i reattaching inode %llu while updating lost+found",
                        ret, inum);
@@ -201,7 +200,7 @@ retry:
        }
 
        ret   = bch2_hash_delete_at(&trans, desc, info, iter) ?:
-               bch2_trans_commit(&trans, NULL, NULL, NULL,
+               bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_ATOMIC|
                                  BTREE_INSERT_NOFAIL);
 err:
@@ -289,6 +288,13 @@ fsck_err:
        return ret;
 }
 
+static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size)
+{
+       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+                       POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9),
+                       POS(inode_nr + 1, 0), NULL);
+}
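The new fsck-local helper deletes everything from the first block past the new size; a standalone sketch of the position calculation (names invented):

#include <stdint.h>

/* Round new_size up to the filesystem block size (a power of two), then
 * convert bytes to the 512-byte sectors used for extent positions. */
static uint64_t truncate_start_sector(uint64_t new_size, uint64_t block_bytes)
{
        uint64_t rounded = (new_size + block_bytes - 1) & ~(block_bytes - 1);

        return rounded >> 9;
}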
+
 /*
  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
  * that i_size and i_sectors are consistent
@@ -319,7 +325,7 @@ static int check_extents(struct bch_fs *c)
                        k.k->type, k.k->p.inode, w.inode.bi_mode)) {
                        bch2_btree_iter_unlock(&iter);
 
-                       ret = bch2_inode_truncate(c, k.k->p.inode, 0, NULL, NULL);
+                       ret = bch2_inode_truncate(c, k.k->p.inode, 0);
                        if (ret)
                                goto err;
                        continue;
@@ -341,10 +347,7 @@ static int check_extents(struct bch_fs *c)
                        bch2_inode_pack(&p, &w.inode);
 
                        ret = bch2_btree_insert(c, BTREE_ID_INODES,
-                                               &p.inode.k_i,
-                                               NULL,
-                                               NULL,
-                                               NULL,
+                                               &p.inode.k_i, NULL, NULL,
                                                BTREE_INSERT_NOFAIL);
                        if (ret) {
                                bch_err(c, "error in fs gc: error %i "
@@ -365,8 +368,7 @@ static int check_extents(struct bch_fs *c)
                        bch2_btree_iter_unlock(&iter);
 
                        ret = bch2_inode_truncate(c, k.k->p.inode,
-                                       round_up(w.inode.bi_size, PAGE_SIZE) >> 9,
-                                       NULL, NULL);
+                                                 w.inode.bi_size);
                        if (ret)
                                goto err;
                        continue;
@@ -397,7 +399,7 @@ static int check_dirents(struct bch_fs *c)
 
        bch2_trans_init(&trans, c);
 
-       BUG_ON(bch2_trans_preload_iters(&trans));
+       bch2_trans_preload_iters(&trans);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS,
                                   POS(BCACHEFS_ROOT_INO, 0), 0);
@@ -507,7 +509,7 @@ static int check_dirents(struct bch_fs *c)
                        bkey_reassemble(&n->k_i, d.s_c);
                        n->v.d_type = mode_to_type(target.bi_mode);
 
-                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                       ret = bch2_btree_insert_at(c, NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
                                        BTREE_INSERT_ENTRY(iter, &n->k_i));
                        kfree(n);
@@ -538,7 +540,7 @@ static int check_xattrs(struct bch_fs *c)
 
        bch2_trans_init(&trans, c);
 
-       BUG_ON(bch2_trans_preload_iters(&trans));
+       bch2_trans_preload_iters(&trans);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS,
                                   POS(BCACHEFS_ROOT_INO, 0), 0);
@@ -601,7 +603,7 @@ create_root:
        bch2_inode_pack(&packed, root_inode);
 
        return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
-                                NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
+                                NULL, NULL, BTREE_INSERT_NOFAIL);
 }
 
 /* Get lost+found, create if it doesn't exist: */
@@ -645,7 +647,7 @@ create_lostfound:
        bch2_inode_pack(&packed, root_inode);
 
        ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
-                               NULL, NULL, NULL, BTREE_INSERT_NOFAIL);
+                               NULL, NULL, BTREE_INSERT_NOFAIL);
        if (ret)
                return ret;
 
@@ -1093,9 +1095,7 @@ static int check_inode(struct bch_fs *c,
                 * just switch units to bytes and that issue goes away
                 */
 
-               ret = bch2_inode_truncate(c, u.bi_inum,
-                               round_up(u.bi_size, PAGE_SIZE) >> 9,
-                               NULL, NULL);
+               ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size);
                if (ret) {
                        bch_err(c, "error in fs gc: error %i "
                                "truncating inode", ret);
@@ -1141,7 +1141,7 @@ static int check_inode(struct bch_fs *c,
 
                bch2_inode_pack(&p, &u);
 
-               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+               ret = bch2_btree_insert_at(c, NULL, NULL,
                                          BTREE_INSERT_NOFAIL,
                                          BTREE_INSERT_ENTRY(iter, &p.inode.k_i));
                if (ret && ret != -EINTR)
index d4139faa341a088a46bd744a8bab634c2fa7eb6b..4841715ca34f6ef9eb62902f9fd638302932d983 100644 (file)
@@ -227,8 +227,8 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
        }
 }
 
-void bch2_inode_to_text(struct bch_fs *c, char *buf,
-                       size_t size, struct bkey_s_c k)
+int bch2_inode_to_text(struct bch_fs *c, char *buf,
+                      size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = out + size;
        struct bkey_s_c_inode inode;
@@ -248,6 +248,8 @@ void bch2_inode_to_text(struct bch_fs *c, char *buf,
 #undef  BCH_INODE_FIELD
                break;
        }
+
+       return out - buf;
 }
 
 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -255,8 +257,8 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
                     struct bch_inode_unpacked *parent)
 {
        s64 now = timespec_to_bch2_time(c,
-               timespec_trunc(current_kernel_time(),
-                              c->sb.time_precision));
+               timespec64_trunc(current_kernel_time64(),
+                                c->sb.time_precision));
 
        memset(inode_u, 0, sizeof(*inode_u));
 
@@ -347,7 +349,8 @@ again:
                        inode_u->bi_generation  = bkey_generation(k);
 
                        bch2_inode_pack(inode_p, inode_u);
-                       bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+                       bch2_trans_update(trans,
+                               BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i));
                        return 0;
                }
        }
@@ -369,33 +372,14 @@ int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
                        __bch2_inode_create(&trans, inode_u, min, max, hint));
 }
 
-int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
-                       struct extent_insert_hook *hook, u64 *journal_seq)
-{
-       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
-                                      POS(inode_nr, new_size),
-                                      POS(inode_nr + 1, 0),
-                                      ZERO_VERSION, NULL, hook,
-                                      journal_seq);
-}
-
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
 {
        struct btree_iter iter;
        struct bkey_i_inode_generation delete;
+       struct bpos start = POS(inode_nr, 0);
+       struct bpos end = POS(inode_nr + 1, 0);
        int ret;
 
-       ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
-       if (ret < 0)
-               return ret;
-
-       ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
-                                    POS(inode_nr, 0),
-                                    POS(inode_nr + 1, 0),
-                                    ZERO_VERSION, NULL, NULL, NULL);
-       if (ret < 0)
-               return ret;
-
        /*
         * If this was a directory, there shouldn't be any real dirents left -
         * but there could be whiteouts (from hash collisions) that we should
@@ -404,11 +388,13 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
         * XXX: the dirent code could ideally delete whiteouts when they're
         * no longer needed
         */
-       ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
-                                    POS(inode_nr, 0),
-                                    POS(inode_nr + 1, 0),
-                                    ZERO_VERSION, NULL, NULL, NULL);
-       if (ret < 0)
+       ret   = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
+                                       start, end, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_XATTRS,
+                                       start, end, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+                                       start, end, NULL);
+       if (ret)
                return ret;
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
@@ -452,7 +438,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
                        delete.v.bi_generation = cpu_to_le32(bi_generation);
                }
 
-               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+               ret = bch2_btree_insert_at(c, NULL, NULL,
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL,
                                BTREE_INSERT_ENTRY(&iter, &delete.k_i));
index a47194ab93e3f1df5679b26d8f97ffbb4c1929ee..93dbdaeb7ecbdbc46f939b75f5bb31168d35eb4e 100644 (file)
@@ -6,7 +6,7 @@
 #include <linux/math64.h>
 
 const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 
 #define bch2_bkey_inode_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_inode_invalid,           \
@@ -45,21 +45,19 @@ int __bch2_inode_create(struct btree_trans *,
 int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
                      u64, u64, u64 *);
 
-int bch2_inode_truncate(struct bch_fs *, u64, u64,
-                      struct extent_insert_hook *, u64 *);
 int bch2_inode_rm(struct bch_fs *, u64);
 
 int bch2_inode_find_by_inum(struct bch_fs *, u64,
                           struct bch_inode_unpacked *);
 
-static inline struct timespec bch2_time_to_timespec(struct bch_fs *c, u64 time)
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
 {
-       return ns_to_timespec(time * c->sb.time_precision + c->sb.time_base_lo);
+       return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
 }
 
-static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
+static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
 {
-       s64 ns = timespec_to_ns(&ts) - c->sb.time_base_lo;
+       s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
 
        if (c->sb.time_precision == 1)
                return ns;
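These helpers define the on-disk time encoding: a stored value t represents ns = t * time_precision + time_base_lo. A hedged round-trip sketch follows; the division in the encode direction is an assumption, since the hunk above only shows the precision == 1 fast path:

#include <stdint.h>

/* Decode matches bch2_time_to_timespec above. */
static int64_t disk_time_to_ns(uint64_t t, uint64_t p, int64_t b)
{
        return (int64_t) (t * p) + b;
}

/* Encode: assumed inverse; truncates sub-precision detail, which is why
 * the helper above can return early when p == 1. */
static uint64_t ns_to_disk_time(int64_t ns, uint64_t p, int64_t b)
{
        return (uint64_t) ((ns - b) / (int64_t) p);
}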
index f26d4041cdda87564e1f9f1cd30b1a1511acac1f..5ca2a2dd83cadcb70c92829c3bff14b256b5fc99 100644 (file)
@@ -285,7 +285,7 @@ int bch2_write_index_default(struct bch_write_op *op)
                             BTREE_ITER_INTENT);
 
        ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
-                                       NULL, op_journal_seq(op),
+                                       op_journal_seq(op),
                                        BTREE_INSERT_NOFAIL|
                                        BTREE_INSERT_USE_RESERVE);
        bch2_btree_iter_unlock(&iter);
@@ -1388,7 +1388,7 @@ retry:
        if (!bch2_extent_narrow_crcs(e, new_crc))
                goto out;
 
-       ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+       ret = bch2_btree_insert_at(c, NULL, NULL,
                                   BTREE_INSERT_ATOMIC|
                                   BTREE_INSERT_NOFAIL|
                                   BTREE_INSERT_NOWAIT,
index b4fe27f8f5ca3fb780d9ffea3c095c67a810841e..634123ebdf1377473304d384da2095aaefa2c66f 100644 (file)
@@ -32,14 +32,8 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
            test_bit(JOURNAL_NEED_WRITE, &j->flags))
                bch2_time_stats_update(j->delay_time,
                                       j->need_write_time);
-#if 0
-       closure_call(&j->io, bch2_journal_write, NULL, NULL);
-#else
-       /* Shut sparse up: */
-       closure_init(&j->io, NULL);
-       set_closure_fn(&j->io, bch2_journal_write, NULL);
-       bch2_journal_write(&j->io);
-#endif
+
+       closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
 }
 
 static void journal_pin_new_entry(struct journal *j, int count)
@@ -96,7 +90,7 @@ static enum {
 } journal_buf_switch(struct journal *j, bool need_write_just_set)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct journal_buf *buf;
+       struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
 
@@ -107,8 +101,11 @@ static enum {
                if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
                        return JOURNAL_ENTRY_CLOSED;
 
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
+                       /* this entry will never be written: */
+                       closure_wake_up(&buf->wait);
                        return JOURNAL_ENTRY_ERROR;
+               }
 
                if (new.prev_buf_unwritten)
                        return JOURNAL_ENTRY_INUSE;
@@ -129,7 +126,6 @@ static enum {
 
        clear_bit(JOURNAL_NEED_WRITE, &j->flags);
 
-       buf = &j->buf[old.idx];
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
        j->prev_buf_sectors =
@@ -138,8 +134,26 @@ static enum {
                c->opts.block_size;
        BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
+       /*
+        * We have to set last_seq here, _before_ opening a new journal entry:
+        *
+        * A thread may replace an old pin with a new pin on its current
+        * journal reservation - the expectation being that the journal will
+        * contain either what the old pin protected or what the new pin
+        * protects.
+        *
+        * After the old pin is dropped, journal_last_seq() won't include the old
+        * pin, so we can only write the updated last_seq on the entry that
+        * contains whatever the new pin protects.
+        *
+        * Restated, we can _not_ update last_seq for a given entry if there
+        * could be a newer entry open with reservations/pins that have been
+        * taken against it.
+        *
+        * Hence, we want to update/set last_seq on the current journal entry right
+        * before we open a new one:
+        */
        bch2_journal_reclaim_fast(j);
-       /* XXX: why set this here, and not in bch2_journal_write()? */
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
        if (journal_entry_empty(buf->data))
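The comment above hinges on journal_last_seq(): the oldest sequence number still holding pins, i.e. where recovery must begin replay. A toy standalone sketch of that relationship (not the kernel implementation):

#include <stdint.h>

/* Pins live in a ring of per-entry refcounts starting at 'front'; last_seq
 * is the first still-pinned sequence, or 'back' if nothing is pinned. */
static uint64_t last_seq_sketch(const unsigned *pin_count,
                                uint64_t front, uint64_t back)
{
        uint64_t seq;

        for (seq = front; seq < back; seq++)
                if (pin_count[seq - front])
                        return seq;
        return back;
}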
@@ -154,13 +168,6 @@ static enum {
        cancel_delayed_work(&j->write_work);
        spin_unlock(&j->lock);
 
-       if (c->bucket_journal_seq > 1 << 14) {
-               c->bucket_journal_seq = 0;
-               bch2_bucket_seq_cleanup(c);
-       }
-
-       c->bucket_journal_seq++;
-
        /* ugh - might be called from __journal_res_get() under wait_event() */
        __set_current_state(TASK_RUNNING);
        bch2_journal_buf_put(j, old.idx, need_write_just_set);
@@ -265,34 +272,41 @@ static int journal_entry_open(struct journal *j)
        return 1;
 }
 
-/*
- * returns true if there's nothing to flush and no journal write still in flight
- */
-static bool journal_flush_write(struct journal *j)
+static bool __journal_entry_close(struct journal *j)
 {
-       bool ret;
-
-       spin_lock(&j->lock);
-       ret = !j->reservations.prev_buf_unwritten;
+       bool set_need_write;
 
        if (!journal_entry_is_open(j)) {
                spin_unlock(&j->lock);
-               return ret;
+               return true;
        }
 
-       set_bit(JOURNAL_NEED_WRITE, &j->flags);
-       if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED)
-               ret = false;
-       else
+       set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
+       if (set_need_write)
+               j->need_write_time = local_clock();
+
+       switch (journal_buf_switch(j, set_need_write)) {
+       case JOURNAL_ENTRY_INUSE:
                spin_unlock(&j->lock);
-       return ret;
+               return false;
+       default:
+               spin_unlock(&j->lock);
+       case JOURNAL_UNLOCKED:
+               return true;
+       }
+}
+
+static bool journal_entry_close(struct journal *j)
+{
+       spin_lock(&j->lock);
+       return __journal_entry_close(j);
 }
 
 static void journal_write_work(struct work_struct *work)
 {
        struct journal *j = container_of(work, struct journal, write_work.work);
 
-       journal_flush_write(j);
+       journal_entry_close(j);
 }
 
 /*
@@ -462,6 +476,37 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
        return ret;
 }
 
+static int journal_seq_error(struct journal *j, u64 seq)
+{
+       union journal_res_state state = READ_ONCE(j->reservations);
+
+       if (seq == journal_cur_seq(j))
+               return bch2_journal_error(j);
+
+       if (seq + 1 == journal_cur_seq(j) &&
+           !state.prev_buf_unwritten &&
+           seq > j->seq_ondisk)
+               return -EIO;
+
+       return 0;
+}
+
+static inline struct journal_buf *
+journal_seq_to_buf(struct journal *j, u64 seq)
+{
+       /* seq should be for a journal entry that has been opened: */
+       BUG_ON(seq > journal_cur_seq(j));
+       BUG_ON(seq == journal_cur_seq(j) &&
+              j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
+
+       if (seq == journal_cur_seq(j))
+               return journal_cur_buf(j);
+       if (seq + 1 == journal_cur_seq(j) &&
+           j->reservations.prev_buf_unwritten)
+               return journal_prev_buf(j);
+       return NULL;
+}
+
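journal_seq_to_buf() captures the two-buffer invariant: only the open entry and, while its write is in flight, the previous entry still have in-memory buffers; anything older is already on disk. A toy sketch of the same mapping (invented names):

#include <stdint.h>

/* Returns which of the two journal buffers holds @seq, or -1 if the entry
 * is already on disk (or was never opened). */
static int seq_to_buf_idx(uint64_t seq, uint64_t cur_seq, int prev_unwritten)
{
        if (seq == cur_seq)
                return 0;               /* current, still-open buffer */
        if (seq + 1 == cur_seq && prev_unwritten)
                return 1;               /* previous buffer, write in flight */
        return -1;
}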
 /**
  * bch2_journal_wait_on_seq - wait for a journal entry to be written
  *
@@ -470,31 +515,22 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
  * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
  * configurable).
  */
-void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent)
+void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
+                             struct closure *parent)
 {
-       spin_lock(&j->lock);
-
-       BUG_ON(seq > journal_cur_seq(j));
+       struct journal_buf *buf;
 
-       if (bch2_journal_error(j)) {
-               spin_unlock(&j->lock);
-               return;
-       }
+       spin_lock(&j->lock);
 
-       if (seq == journal_cur_seq(j)) {
-               if (!closure_wait(&journal_cur_buf(j)->wait, parent))
-                       BUG();
-       } else if (seq + 1 == journal_cur_seq(j) &&
-                  j->reservations.prev_buf_unwritten) {
-               if (!closure_wait(&journal_prev_buf(j)->wait, parent))
+       if ((buf = journal_seq_to_buf(j, seq))) {
+               if (!closure_wait(&buf->wait, parent))
                        BUG();
 
-               smp_mb();
-
-               /* check if raced with write completion (or failure) */
-               if (!j->reservations.prev_buf_unwritten ||
-                   bch2_journal_error(j))
-                       closure_wake_up(&journal_prev_buf(j)->wait);
+               if (seq == journal_cur_seq(j)) {
+                       smp_mb();
+                       if (bch2_journal_error(j))
+                               closure_wake_up(&buf->wait);
+               }
        }
 
        spin_unlock(&j->lock);
@@ -506,108 +542,35 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
  * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
-void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent)
+void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
+                                 struct closure *parent)
 {
        struct journal_buf *buf;
 
        spin_lock(&j->lock);
 
-       BUG_ON(seq > journal_cur_seq(j));
-
-       if (bch2_journal_error(j)) {
-               spin_unlock(&j->lock);
-               return;
-       }
-
-       if (seq == journal_cur_seq(j)) {
-               bool set_need_write = false;
-
-               buf = journal_cur_buf(j);
-
-               if (parent && !closure_wait(&buf->wait, parent))
-                       BUG();
-
-               if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
-                       j->need_write_time = local_clock();
-                       set_need_write = true;
-               }
-
-               switch (journal_buf_switch(j, set_need_write)) {
-               case JOURNAL_ENTRY_ERROR:
-                       if (parent)
-                               closure_wake_up(&buf->wait);
-                       break;
-               case JOURNAL_ENTRY_CLOSED:
-                       /*
-                        * Journal entry hasn't been opened yet, but caller
-                        * claims it has something
-                        */
-                       BUG();
-               case JOURNAL_ENTRY_INUSE:
-                       break;
-               case JOURNAL_UNLOCKED:
-                       return;
-               }
-       } else if (parent &&
-                  seq + 1 == journal_cur_seq(j) &&
-                  j->reservations.prev_buf_unwritten) {
-               buf = journal_prev_buf(j);
-
+       if (parent &&
+           (buf = journal_seq_to_buf(j, seq)))
                if (!closure_wait(&buf->wait, parent))
                        BUG();
 
-               smp_mb();
-
-               /* check if raced with write completion (or failure) */
-               if (!j->reservations.prev_buf_unwritten ||
-                   bch2_journal_error(j))
-                       closure_wake_up(&buf->wait);
-       }
-
-       spin_unlock(&j->lock);
+       if (seq == journal_cur_seq(j))
+               __journal_entry_close(j);
+       else
+               spin_unlock(&j->lock);
 }
 
 static int journal_seq_flushed(struct journal *j, u64 seq)
 {
-       struct journal_buf *buf;
-       int ret = 1;
+       int ret;
 
        spin_lock(&j->lock);
-       BUG_ON(seq > journal_cur_seq(j));
-
-       if (seq == journal_cur_seq(j)) {
-               bool set_need_write = false;
+       ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
 
-               ret = 0;
-
-               buf = journal_cur_buf(j);
-
-               if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
-                       j->need_write_time = local_clock();
-                       set_need_write = true;
-               }
-
-               switch (journal_buf_switch(j, set_need_write)) {
-               case JOURNAL_ENTRY_ERROR:
-                       ret = -EIO;
-                       break;
-               case JOURNAL_ENTRY_CLOSED:
-                       /*
-                        * Journal entry hasn't been opened yet, but caller
-                        * claims it has something
-                        */
-                       BUG();
-               case JOURNAL_ENTRY_INUSE:
-                       break;
-               case JOURNAL_UNLOCKED:
-                       return 0;
-               }
-       } else if (seq + 1 == journal_cur_seq(j) &&
-                  j->reservations.prev_buf_unwritten) {
-               ret = bch2_journal_error(j);
-       }
-
-       spin_unlock(&j->lock);
+       if (seq == journal_cur_seq(j))
+               __journal_entry_close(j);
+       else
+               spin_unlock(&j->lock);
 
        return ret;
 }
@@ -727,6 +690,10 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        if (!journal_buckets)
                goto err;
 
+       /*
+        * We may be called from the device add path, before the new device has
+        * actually been added to the running filesystem:
+        */
        if (c)
                spin_lock(&c->journal.lock);
 
@@ -743,10 +710,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                long bucket;
 
                if (new_fs) {
-                       percpu_down_read_preempt_disable(&c->usage_lock);
                        bucket = bch2_bucket_alloc_new_fs(ca);
-                       percpu_up_read_preempt_enable(&c->usage_lock);
-
                        if (bucket < 0) {
                                ret = -ENOSPC;
                                goto err;
@@ -765,6 +729,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (c) {
                        percpu_down_read_preempt_disable(&c->usage_lock);
                        spin_lock(&c->journal.lock);
+               } else {
+                       preempt_disable();
                }
 
                __array_insert_item(ja->buckets,                ja->nr, ja->last_idx);
@@ -792,6 +758,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (c) {
                        spin_unlock(&c->journal.lock);
                        percpu_up_read_preempt_enable(&c->usage_lock);
+               } else {
+                       preempt_enable();
                }
 
                if (!new_fs)
@@ -904,13 +872,16 @@ void bch2_fs_journal_stop(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
-       wait_event(j->wait, journal_flush_write(j));
+       wait_event(j->wait, journal_entry_close(j));
 
        /* do we need to write another journal entry? */
        if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
            c->btree_roots_dirty)
                bch2_journal_meta(j);
 
+       BUG_ON(journal_entry_is_open(j) ||
+              j->reservations.prev_buf_unwritten);
+
        BUG_ON(!bch2_journal_error(j) &&
               test_bit(JOURNAL_NOT_EMPTY, &j->flags));
 
@@ -920,6 +891,7 @@ void bch2_fs_journal_stop(struct journal *j)
 
 void bch2_fs_journal_start(struct journal *j)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_seq_blacklist *bl;
        u64 blacklist = 0;
 
@@ -941,6 +913,8 @@ void bch2_fs_journal_start(struct journal *j)
        journal_pin_new_entry(j, 1);
        bch2_journal_buf_init(j);
 
+       c->last_bucket_seq_cleanup = journal_cur_seq(j);
+
        spin_unlock(&j->lock);
 
        /*
@@ -1014,6 +988,7 @@ int bch2_fs_journal_init(struct journal *j)
        init_waitqueue_head(&j->wait);
        INIT_DELAYED_WORK(&j->write_work, journal_write_work);
        INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
+       init_waitqueue_head(&j->pin_flush_wait);
        mutex_init(&j->blacklist_lock);
        INIT_LIST_HEAD(&j->seq_blacklist);
        mutex_init(&j->reclaim_lock);
index 8a4e7b2a92ce7cdcea1d8184036639f9bd998dc5..2a70edc28184e38608f67ed5ea3dbbee345069b7 100644 (file)
@@ -901,7 +901,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                                        bch2_disk_reservation_init(c, 0);
 
                                ret = bch2_btree_insert(c, entry->btree_id, k,
-                                                       &disk_res, NULL, NULL,
+                                                       &disk_res, NULL,
                                                        BTREE_INSERT_NOFAIL|
                                                        BTREE_INSERT_JOURNAL_REPLAY);
                        }
@@ -1204,6 +1204,9 @@ static void journal_write_done(struct closure *cl)
        struct bch_devs_list devs =
                bch2_extent_devs(bkey_i_to_s_c_extent(&w->key));
        u64 seq = le64_to_cpu(w->data->seq);
+       u64 last_seq = le64_to_cpu(w->data->last_seq);
+
+       bch2_time_stats_update(j->write_time, j->write_start_time);
 
        if (!devs.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
@@ -1212,11 +1215,11 @@ static void journal_write_done(struct closure *cl)
 
        if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
                goto err;
-out:
-       bch2_time_stats_update(j->write_time, j->write_start_time);
 
        spin_lock(&j->lock);
-       j->last_seq_ondisk = seq;
+       j->seq_ondisk           = seq;
+       j->last_seq_ondisk      = last_seq;
+
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = devs;
 
@@ -1228,7 +1231,7 @@ out:
         * bch2_fs_journal_stop():
         */
        mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-
+out:
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
 
@@ -1246,6 +1249,7 @@ out:
 err:
        bch2_fatal_error(c);
        bch2_journal_halt(j);
+       spin_lock(&j->lock);
        goto out;
 }
 
@@ -1385,6 +1389,8 @@ no_io:
        extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
                ptr->offset += sectors;
 
+       bch2_bucket_seq_cleanup(c);
+
        continue_at(cl, journal_write_done, system_highpri_wq);
        return;
 err:
index 394b72bb55187a00ac852e2958dfa083b90cbf18..978aba7207903a4c39622c7640b763c764e648b6 100644 (file)
  * entry, holding it open to ensure it gets replayed during recovery:
  */
 
-static inline u64 journal_pin_seq(struct journal *j,
-                                 struct journal_entry_pin_list *pin_list)
-{
-       return fifo_entry_idx_abs(&j->pin, pin_list);
-}
-
-u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
-{
-       u64 ret = 0;
-
-       spin_lock(&j->lock);
-       if (journal_pin_active(pin))
-               ret = journal_pin_seq(j, pin->pin_list);
-       spin_unlock(&j->lock);
-
-       return ret;
-}
-
 static inline void __journal_pin_add(struct journal *j,
-                                    struct journal_entry_pin_list *pin_list,
+                                    u64 seq,
                                     struct journal_entry_pin *pin,
                                     journal_pin_flush_fn flush_fn)
 {
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
        BUG_ON(journal_pin_active(pin));
        BUG_ON(!atomic_read(&pin_list->count));
 
        atomic_inc(&pin_list->count);
-       pin->pin_list   = pin_list;
+       pin->seq        = seq;
        pin->flush      = flush_fn;
 
        if (flush_fn)
@@ -57,19 +41,20 @@ void bch2_journal_pin_add(struct journal *j, u64 seq,
                          journal_pin_flush_fn flush_fn)
 {
        spin_lock(&j->lock);
-       __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
+       __journal_pin_add(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);
 }
 
 static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
 {
-       struct journal_entry_pin_list *pin_list = pin->pin_list;
+       struct journal_entry_pin_list *pin_list;
 
        if (!journal_pin_active(pin))
                return;
 
-       pin->pin_list = NULL;
+       pin_list = journal_seq_pin(j, pin->seq);
+       pin->seq = 0;
        list_del_init(&pin->list);
 
        /*
@@ -82,7 +67,7 @@ static inline void __journal_pin_drop(struct journal *j,
 }
 
 void bch2_journal_pin_drop(struct journal *j,
-                         struct journal_entry_pin *pin)
+                          struct journal_entry_pin *pin)
 {
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
@@ -98,15 +83,21 @@ void bch2_journal_pin_add_if_older(struct journal *j,
 
        if (journal_pin_active(src_pin) &&
            (!journal_pin_active(pin) ||
-            journal_pin_seq(j, src_pin->pin_list) <
-            journal_pin_seq(j, pin->pin_list))) {
+            src_pin->seq < pin->seq)) {
                __journal_pin_drop(j, pin);
-               __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
+               __journal_pin_add(j, src_pin->seq, pin, flush_fn);
        }
 
        spin_unlock(&j->lock);
 }
 
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
+{
+       BUG_ON(journal_pin_active(pin));
+
+       wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
+}
+
 /*
  * Journal reclaim: flush references to open journal entries to reclaim space in
  * the journal
@@ -144,41 +135,42 @@ void bch2_journal_reclaim_fast(struct journal *j)
                journal_wake(j);
 }
 
-static struct journal_entry_pin *
-__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+static void journal_pin_mark_flushing(struct journal *j,
+                                     struct journal_entry_pin *pin,
+                                     u64 seq)
 {
-       struct journal_entry_pin_list *pin_list;
-       struct journal_entry_pin *ret;
-       u64 iter;
-
-       /* no need to iterate over empty fifo entries: */
-       bch2_journal_reclaim_fast(j);
+       lockdep_assert_held(&j->reclaim_lock);
 
-       fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
-               if (iter > seq_to_flush)
-                       break;
+       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
+       BUG_ON(j->flush_in_progress);
+       j->flush_in_progress = pin;
+}
 
-               ret = list_first_entry_or_null(&pin_list->list,
-                               struct journal_entry_pin, list);
-               if (ret) {
-                       /* must be list_del_init(), see bch2_journal_pin_drop() */
-                       list_move(&ret->list, &pin_list->flushed);
-                       *seq = iter;
-                       return ret;
-               }
-       }
+static void journal_pin_flush(struct journal *j,
+                             struct journal_entry_pin *pin,
+                             u64 seq)
+{
+       pin->flush(j, pin, seq);
 
-       return NULL;
+       BUG_ON(j->flush_in_progress != pin);
+       j->flush_in_progress = NULL;
+       wake_up(&j->pin_flush_wait);
 }
 
 static struct journal_entry_pin *
 journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
 {
-       struct journal_entry_pin *ret;
+       struct journal_entry_pin_list *pin_list;
+       struct journal_entry_pin *ret = NULL;
 
-       spin_lock(&j->lock);
-       ret = __journal_get_next_pin(j, seq_to_flush, seq);
-       spin_unlock(&j->lock);
+       /* no need to iterate over empty fifo entries: */
+       bch2_journal_reclaim_fast(j);
+
+       fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
+               if (*seq > seq_to_flush ||
+                   (ret = list_first_entry_or_null(&pin_list->list,
+                               struct journal_entry_pin, list)))
+                       break;
 
        return ret;
 }
@@ -278,15 +270,11 @@ void bch2_journal_reclaim_work(struct work_struct *work)
                spin_unlock(&j->lock);
        }
 
-       if (reclaim_lock_held)
-               mutex_unlock(&j->reclaim_lock);
-
        /* Also flush if the pin fifo is more than half full */
        spin_lock(&j->lock);
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
-       spin_unlock(&j->lock);
 
        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
@@ -298,13 +286,31 @@ void bch2_journal_reclaim_work(struct work_struct *work)
        while ((pin = journal_get_next_pin(j, need_flush
                                           ? U64_MAX
                                           : seq_to_flush, &seq))) {
-               __set_current_state(TASK_RUNNING);
-               pin->flush(j, pin, seq);
-               need_flush = false;
+               if (!reclaim_lock_held) {
+                       spin_unlock(&j->lock);
+                       __set_current_state(TASK_RUNNING);
+                       mutex_lock(&j->reclaim_lock);
+                       reclaim_lock_held = true;
+                       spin_lock(&j->lock);
+                       continue;
+               }
 
+               journal_pin_mark_flushing(j, pin, seq);
+               spin_unlock(&j->lock);
+
+               journal_pin_flush(j, pin, seq);
+
+               need_flush = false;
                j->last_flushed = jiffies;
+
+               spin_lock(&j->lock);
        }
 
+       spin_unlock(&j->lock);
+
+       if (reclaim_lock_held)
+               mutex_unlock(&j->reclaim_lock);
+
        if (!test_bit(BCH_FS_RO, &c->flags))
                queue_delayed_work(system_freezable_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
@@ -327,11 +333,14 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
-       ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
+       ret = (*pin = journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
                !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);
+       if (*pin)
+               journal_pin_mark_flushing(j, *pin, *pin_seq);
+
        spin_unlock(&j->lock);
 
        return ret;
@@ -345,14 +354,18 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;
 
+       mutex_lock(&j->reclaim_lock);
+
        while (1) {
                wait_event(j->wait, journal_flush_done(j, seq_to_flush,
                                                       &pin, &pin_seq));
                if (!pin)
                        break;
 
-               pin->flush(j, pin, pin_seq);
+               journal_pin_flush(j, pin, pin_seq);
        }
+
+       mutex_unlock(&j->reclaim_lock);
 }
 
 int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
index eb22790251decd34df5e186ac0b88c4d4165efb4..f7dcbfd398d578334fea137ecba9d0e8a1dc8420 100644 (file)
@@ -5,19 +5,17 @@
 
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
 {
-       return pin->pin_list != NULL;
+       return pin->seq != 0;
 }
 
 static inline struct journal_entry_pin_list *
 journal_seq_pin(struct journal *j, u64 seq)
 {
-       BUG_ON(seq < j->pin.front || seq >= j->pin.back);
+       EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
 
        return &j->pin.data[seq & j->pin.mask];
 }
 
-u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
-
 void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
                          journal_pin_flush_fn);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
@@ -25,6 +23,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
                                  struct journal_entry_pin *,
                                  struct journal_entry_pin *,
                                  journal_pin_flush_fn);
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
 void bch2_journal_reclaim_fast(struct journal *);
 void bch2_journal_reclaim_work(struct work_struct *);
index effbeece1ed98c8822d7bcc593071891db9e9e13..26702482b85a6f88613b3de803cee2ef80339706 100644 (file)
@@ -47,7 +47,7 @@ typedef void (*journal_pin_flush_fn)(struct journal *j,
 struct journal_entry_pin {
        struct list_head                list;
        journal_pin_flush_fn            flush;
-       struct journal_entry_pin_list   *pin_list;
+       u64                             seq;
 };
 
 /* corresponds to a btree node with a blacklisted bset: */
@@ -150,7 +150,8 @@ struct journal {
        /* Sequence number of most recent journal entry (last entry in @pin) */
        atomic64_t              seq;
 
-       /* last_seq from the most recent journal entry written */
+       /* seq, last_seq from the most recent journal entry successfully written */
+       u64                     seq_ondisk;
        u64                     last_seq_ondisk;
 
        /*
@@ -173,6 +174,10 @@ struct journal {
                u64 front, back, size, mask;
                struct journal_entry_pin_list *data;
        }                       pin;
+
+       struct journal_entry_pin *flush_in_progress;
+       wait_queue_head_t       pin_flush_wait;
+
        u64                     replay_journal_seq;
 
        struct mutex            blacklist_lock;
index 215c5aa5be0ecc59e0ab50340b35e5eefa5777e7..f5cbf44d7b8c21b7a1a5b1536a364375dffde9ea 100644 (file)
@@ -78,7 +78,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
                iter.pos = bkey_start_pos(&tmp.key.k);
 
-               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+               ret = bch2_btree_insert_at(c, NULL, NULL,
                                           BTREE_INSERT_ATOMIC|
                                           BTREE_INSERT_NOFAIL,
                                           BTREE_INSERT_ENTRY(&iter, &tmp.key));
index 3e52b7a26c7f51e5e3c37ec73da41885fa93fcb9..4a5e435bfe4b1bffee773c1dc1005d449a6ac403 100644 (file)
@@ -158,7 +158,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                        break;
 
                ret = bch2_btree_insert_at(c, &op->res,
-                               NULL, op_journal_seq(op),
+                               op_journal_seq(op),
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_USE_RESERVE|
index 7bef456110f1e1132ce812e2a15e3c8cc8940e85..d414ee94cc2c3677da2468c512f4871911c25a30 100644 (file)
@@ -227,16 +227,10 @@ static int bch2_copygc_thread(void *arg)
 
                last = atomic_long_read(&clock->now);
 
-               reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
-                                ca->mi.bucket_size *
-                                c->opts.gc_reserve_percent, 200);
+               reserve = ca->copygc_threshold;
 
                usage = bch2_dev_usage_read(c, ca);
 
-               /*
-                * don't start copygc until less than half the gc reserve is
-                * available:
-                */
                available = __dev_buckets_available(ca, usage) *
                        ca->mi.bucket_size;
                if (available > reserve) {
index f476033e707f3b2cf74345fd53f8ccfb593c5a58..79b16fe73da600149ce81bbd39bfb9702aad915f 100644 (file)
@@ -113,9 +113,12 @@ enum opt_type {
        BCH_OPT(inodes_32bit,           u8,     OPT_RUNTIME,            \
                OPT_BOOL(),                                             \
                BCH_SB_INODE_32BIT,             false)                  \
-       BCH_OPT(gc_reserve_percent,     u8,     OPT_MOUNT,              \
+       BCH_OPT(gc_reserve_percent,     u8,     OPT_RUNTIME,            \
                OPT_UINT(5, 21),                                        \
                BCH_SB_GC_RESERVE,              8)                      \
+       BCH_OPT(gc_reserve_bytes,       u64,    OPT_RUNTIME,            \
+               OPT_UINT(0, U64_MAX),                                   \
+               BCH_SB_GC_RESERVE_BYTES,        0)                      \
        BCH_OPT(root_reserve_percent,   u8,     OPT_MOUNT,              \
                OPT_UINT(0, 100),                                       \
                BCH_SB_ROOT_RESERVE,            0)                      \
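gc_reserve_bytes makes the copygc reserve configurable as an absolute size rather than only as a percentage (and gc_reserve_percent becomes runtime-settable). A sketch of plausible selection logic — the override semantics here are an assumption, not taken from the patch:

#include <stdint.h>

/* Assumed: a nonzero byte value takes precedence over the percentage. */
static uint64_t gc_reserve_sketch(uint64_t capacity_bytes,
                                  uint64_t reserve_bytes,
                                  unsigned reserve_percent)
{
        if (reserve_bytes)
                return reserve_bytes;
        return capacity_bytes * reserve_percent / 100;
}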
index bb03d83a53e4901561979df644309d1694dcdc88..e16045815822a867120ccd2ad8800b9ee01cd7d1 100644 (file)
@@ -45,10 +45,10 @@ static const char * const bch2_quota_counters[] = {
        "inodes",
 };
 
-void bch2_quota_to_text(struct bch_fs *c, char *buf,
-                       size_t size, struct bkey_s_c k)
+int bch2_quota_to_text(struct bch_fs *c, char *buf,
+                      size_t size, struct bkey_s_c k)
 {
-       char *out = buf, *end= buf + size;
+       char *out = buf, *end = buf + size;
        struct bkey_s_c_quota dq;
        unsigned i;
 
@@ -63,6 +63,8 @@ void bch2_quota_to_text(struct bch_fs *c, char *buf,
                                         le64_to_cpu(dq.v->c[i].softlimit));
                break;
        }
+
+       return out - buf;
 }
 
 #ifdef CONFIG_BCACHEFS_QUOTA
@@ -538,7 +540,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
                                              POS(QTYP_USR, 0),
                                              POS(QTYP_USR + 1, 0),
-                                             ZERO_VERSION, NULL, NULL, NULL);
+                                             NULL);
                if (ret)
                        return ret;
        }
@@ -550,7 +552,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
                                              POS(QTYP_GRP, 0),
                                              POS(QTYP_GRP + 1, 0),
-                                             ZERO_VERSION, NULL, NULL, NULL);
+                                             NULL);
                if (ret)
                        return ret;
        }
@@ -562,7 +564,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
                                              POS(QTYP_PRJ, 0),
                                              POS(QTYP_PRJ + 1, 0),
-                                             ZERO_VERSION, NULL, NULL, NULL);
+                                             NULL);
                if (ret)
                        return ret;
        }
@@ -761,7 +763,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        if (qdq->d_fieldmask & QC_INO_HARD)
                new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
 
-       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+       ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                   BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
        bch2_btree_iter_unlock(&iter);
 
index 0b24f22cf4fbeac6224f6859c073f85bd9225bbf..14570c8b277920ae5dcf1a098f29f13ba22dc488 100644 (file)
@@ -7,7 +7,7 @@
 extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
 
 const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 
 #define bch2_bkey_quota_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_quota_invalid,           \
index 0af136d674c42e8328246757c6ed924c7d476a1d..3a20a77474f11106ad05a429de122ac1753b30a6 100644 (file)
@@ -330,7 +330,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        err = "error creating root directory";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
-                               NULL, NULL, NULL, 0);
+                               NULL, NULL, 0);
        if (ret)
                goto err;
 
@@ -343,7 +343,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        err = "error creating lost+found";
        ret = bch2_btree_insert(c, BTREE_ID_INODES,
                                &packed_inode.inode.k_i,
-                               NULL, NULL, NULL, 0);
+                               NULL, NULL, 0);
        if (ret)
                goto err;
 
index 99f1fe87329987100481a483b28944fe52e524d4..7eff5a42d91e6fe9e27fc837032bf5ff01ba9040 100644 (file)
@@ -254,14 +254,14 @@ not_found:
                return -ENOENT;
 
        insert->k.p = slot->pos;
-       bch2_trans_update(trans, slot, insert, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(slot, insert));
        return 0;
 found:
        if (flags & BCH_HASH_SET_MUST_CREATE)
                return -EEXIST;
 
        insert->k.p = iter->pos;
-       bch2_trans_update(trans, iter, insert, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert));
        return 0;
 }
 
@@ -296,7 +296,7 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans,
        delete->k.p = iter->pos;
        delete->k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
 
-       bch2_trans_update(trans, iter, delete, 0);
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete));
        return 0;
 }
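
bch2_trans_update() now takes an entry built by BTREE_INSERT_ENTRY() instead of separate iterator/key/flags arguments, mirroring bch2_btree_insert_at(). A hedged sketch of the new call shape (hash_delete_sketch is illustrative only):

	static void hash_delete_sketch(struct btree_trans *trans,
				       struct btree_iter *iter,
				       struct bkey_i *delete)
	{
		bkey_init(&delete->k);
		delete->k.p = iter->pos;

		/* The macro pairs the iterator with the key to insert: */
		bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete));
	}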
 
index a2a32b924434b15182d616b9eded5b012dca9725..f4cf44a03394bcf83aded948e349e68de371980f 100644 (file)
@@ -403,6 +403,7 @@ static void bch2_fs_free(struct bch_fs *c)
        bch2_fs_compress_exit(c);
        percpu_free_rwsem(&c->usage_lock);
        free_percpu(c->usage_percpu);
+       mempool_exit(&c->btree_iters_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->btree_interior_update_pool);
@@ -435,6 +436,8 @@ void bch2_fs_stop(struct bch_fs *c)
        struct bch_dev *ca;
        unsigned i;
 
+       bch_verbose(c, "shutting down");
+
        for_each_member_device(ca, c, i)
                if (ca->kobj.state_in_sysfs &&
                    ca->disk_sb.bdev)
@@ -476,6 +479,8 @@ void bch2_fs_stop(struct bch_fs *c)
                if (c->devs[i])
                        bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
 
+       bch_verbose(c, "shutdown complete");
+
        kobject_put(&c->kobj);
 }
 
@@ -628,6 +633,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            percpu_init_rwsem(&c->usage_lock) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
+           mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+                       sizeof(struct btree_iter) * BTREE_ITER_MAX) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
@@ -1019,14 +1026,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
                ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
-       if (ca->fs)
-               mutex_lock(&ca->fs->sb_lock);
-
-       bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-
-       if (ca->fs)
-               mutex_unlock(&ca->fs->sb_lock);
-
        percpu_ref_reinit(&ca->io_ref);
 
        return 0;
@@ -1052,6 +1051,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
        if (ret)
                return ret;
 
+       mutex_lock(&c->sb_lock);
+       bch2_mark_dev_superblock(ca->fs, ca,
+                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       mutex_unlock(&c->sb_lock);
+
        bch2_dev_sysfs_online(c, ca);
 
        if (c->sb.nr_devices == 1)
@@ -1280,8 +1284,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
        ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
                                      POS(ca->dev_idx, 0),
                                      POS(ca->dev_idx + 1, 0),
-                                     ZERO_VERSION,
-                                     NULL, NULL, NULL);
+                                     NULL);
        if (ret) {
                bch_err(ca, "Remove failed, error deleting alloc info");
                goto err;
@@ -1329,6 +1332,24 @@ err:
        return ret;
 }
 
+static void dev_usage_clear(struct bch_dev *ca)
+{
+       struct bucket_array *buckets;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct bch_dev_usage *p =
+                       per_cpu_ptr(ca->usage_percpu, cpu);
+               memset(p, 0, sizeof(*p));
+       }
+
+       down_read(&ca->bucket_lock);
+       buckets = bucket_array(ca);
+
+       memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
+       up_read(&ca->bucket_lock);
+}
+
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1367,11 +1388,28 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                return ret;
        }
 
+       /*
+        * We want to allocate the journal on the new device before adding
+        * it to the filesystem, because allocating after we attach requires
+        * spinning up the allocator thread, and the allocator thread needs
+        * to do btree writes, which won't work if the existing devices are
+        * RO.
+        *
+        * So we have to mark where the superblocks are, but marking
+        * allocated data normally updates filesystem usage too. Hence: mark,
+        * allocate the journal, reset all the marks, then remark after we
+        * attach.
+        */
+       bch2_mark_dev_superblock(ca->fs, ca,
+                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
        err = "journal alloc failed";
        ret = bch2_dev_journal_alloc(ca);
        if (ret)
                goto err;
 
+       dev_usage_clear(ca);
+
        mutex_lock(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
@@ -1422,6 +1460,9 @@ have_slot:
        ca->disk_sb.sb->dev_idx = dev_idx;
        bch2_dev_attach(c, ca, dev_idx);
 
+       bch2_mark_dev_superblock(c, ca,
+                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
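
Condensing the new comment above: the order of operations in bch2_dev_add() is now mark, allocate, clear, attach, remark. A sketch with locking and error handling elided (see the hunks for the real code):

	/* 1: mark superblock buckets so the journal allocator can see them */
	bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);

	/* 2: allocate the journal before attach; existing members may be RO */
	ret = bch2_dev_journal_alloc(ca);

	/* 3: marking also bumped filesystem usage too early - throw it away */
	dev_usage_clear(ca);

	/* 4: attach, then mark again so usage ends up counted exactly once */
	bch2_dev_attach(c, ca, dev_idx);
	bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);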
 
index 4987ee76a08cbdeb4153b04990941643bda39e0d..b353d7cdb6cdb884544bded4a427185406eeb298 100644 (file)
@@ -229,41 +229,42 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 
 static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
+       char *out = buf, *end = buf + PAGE_SIZE;
        struct bch_fs_usage stats = bch2_fs_usage_read(c);
+       unsigned replicas, type;
+
+       out += scnprintf(out, end - out,
+                        "capacity:\t\t%llu\n",
+                        c->capacity);
+
+       for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) {
+               out += scnprintf(out, end - out,
+                                "%u replicas:\n",
+                                replicas + 1);
+
+               for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
+                       out += scnprintf(out, end - out,
+                                        "\t%s:\t\t%llu\n",
+                                        bch2_data_types[type],
+                                        stats.replicas[replicas].data[type]);
+               out += scnprintf(out, end - out,
+                                "\treserved:\t%llu\n",
+                                stats.replicas[replicas].persistent_reserved);
+       }
 
-       return scnprintf(buf, PAGE_SIZE,
-                        "capacity:\t\t%llu\n"
-                        "1 replicas:\n"
-                        "\tmeta:\t\t%llu\n"
-                        "\tdirty:\t\t%llu\n"
-                        "\treserved:\t%llu\n"
-                        "2 replicas:\n"
-                        "\tmeta:\t\t%llu\n"
-                        "\tdirty:\t\t%llu\n"
-                        "\treserved:\t%llu\n"
-                        "3 replicas:\n"
-                        "\tmeta:\t\t%llu\n"
-                        "\tdirty:\t\t%llu\n"
-                        "\treserved:\t%llu\n"
-                        "4 replicas:\n"
-                        "\tmeta:\t\t%llu\n"
-                        "\tdirty:\t\t%llu\n"
-                        "\treserved:\t%llu\n"
+       out += scnprintf(out, end - out, "bucket usage\n");
+
+       for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
+               out += scnprintf(out, end - out,
+                                "\t%s:\t\t%llu\n",
+                                bch2_data_types[type],
+                                stats.buckets[type]);
+
+       out += scnprintf(out, end - out,
                         "online reserved:\t%llu\n",
-                        c->capacity,
-                        stats.s[0].data[S_META],
-                        stats.s[0].data[S_DIRTY],
-                        stats.s[0].persistent_reserved,
-                        stats.s[1].data[S_META],
-                        stats.s[1].data[S_DIRTY],
-                        stats.s[1].persistent_reserved,
-                        stats.s[2].data[S_META],
-                        stats.s[2].data[S_DIRTY],
-                        stats.s[2].persistent_reserved,
-                        stats.s[3].data[S_META],
-                        stats.s[3].data[S_DIRTY],
-                        stats.s[3].persistent_reserved,
                         stats.online_reserved);
+
+       return out - buf;
 }
 
 static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
@@ -779,13 +780,15 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                "    meta:               %llu\n"
                "    user:               %llu\n"
                "    cached:             %llu\n"
-               "    available:          %llu\n"
+               "    available:          %lli\n"
                "sectors:\n"
                "    sb:                 %llu\n"
                "    journal:            %llu\n"
                "    meta:               %llu\n"
                "    user:               %llu\n"
                "    cached:             %llu\n"
+               "    fragmented:         %llu\n"
+               "    copygc threshold:   %llu\n"
                "freelist_wait:          %s\n"
                "open buckets:           %u/%u (reserved %u)\n"
                "open_buckets_wait:      %s\n",
@@ -800,12 +803,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                stats.buckets[BCH_DATA_BTREE],
                stats.buckets[BCH_DATA_USER],
                stats.buckets[BCH_DATA_CACHED],
-               __dev_buckets_available(ca, stats),
+               ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
                stats.sectors[BCH_DATA_SB],
                stats.sectors[BCH_DATA_JOURNAL],
                stats.sectors[BCH_DATA_BTREE],
                stats.sectors[BCH_DATA_USER],
                stats.sectors[BCH_DATA_CACHED],
+               stats.sectors_fragmented,
+               ca->copygc_threshold,
                c->freelist_wait.list.first             ? "waiting" : "empty",
                c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
                c->open_buckets_wait.list.first         ? "waiting" : "empty");
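
One detail worth noting in the hunk above: "available" is now computed inline and printed with %lli rather than %llu. Presumably this is because the per-cpu usage sums can transiently overshoot, making the difference briefly negative; an unsigned format would print that as an enormous bogus value. Roughly:

	s64 available = (s64) ca->mi.nbuckets
		- ca->mi.first_bucket
		- stats.buckets_unavailable;	/* may briefly go below zero */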
index 31847a94a2c70fdc205b65800dfe3de47eb13151..f06eb2d8425f4131f8d455f0e38bd0b111426d15 100644 (file)
@@ -14,12 +14,12 @@ static void delete_test_keys(struct bch_fs *c)
 
        ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
                                      POS(0, 0), POS(0, U64_MAX),
-                                     ZERO_VERSION, NULL, NULL, NULL);
+                                     NULL);
        BUG_ON(ret);
 
        ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
                                      POS(0, 0), POS(0, U64_MAX),
-                                     ZERO_VERSION, NULL, NULL, NULL);
+                                     NULL);
        BUG_ON(ret);
 }
 
@@ -39,7 +39,7 @@ static void test_delete(struct bch_fs *c, u64 nr)
        ret = bch2_btree_iter_traverse(&iter);
        BUG_ON(ret);
 
-       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+       ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                   BTREE_INSERT_ENTRY(&iter, &k.k_i));
        BUG_ON(ret);
 
@@ -68,7 +68,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr)
        ret = bch2_btree_iter_traverse(&iter);
        BUG_ON(ret);
 
-       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+       ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                   BTREE_INSERT_ENTRY(&iter, &k.k_i));
        BUG_ON(ret);
 
@@ -98,7 +98,7 @@ static void test_iterate(struct bch_fs *c, u64 nr)
                k.k.p.offset = i;
 
                ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 
@@ -140,7 +140,7 @@ static void test_iterate_extents(struct bch_fs *c, u64 nr)
                k.k.size = 8;
 
                ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 
@@ -185,7 +185,7 @@ static void test_iterate_slots(struct bch_fs *c, u64 nr)
                k.k.p.offset = i * 2;
 
                ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 
@@ -235,7 +235,7 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                k.k.size = 8;
 
                ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 
@@ -270,6 +270,63 @@ static void test_iterate_slots_extents(struct bch_fs *c, u64 nr)
        bch2_btree_iter_unlock(&iter);
 }
 
+/* extent unit tests */
+
+u64 test_version;
+
+static void insert_test_extent(struct bch_fs *c,
+                              u64 start, u64 end)
+{
+       struct bkey_i_cookie k;
+       int ret;
+
+       //pr_info("inserting %llu-%llu v %llu", start, end, test_version);
+
+       bkey_cookie_init(&k.k_i);
+       k.k_i.k.p.offset = end;
+       k.k_i.k.size = end - start;
+       k.k_i.k.version.lo = test_version++;
+
+       ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i,
+                               NULL, NULL, 0);
+       BUG_ON(ret);
+}
+
+static void __test_extent_overwrite(struct bch_fs *c,
+                                   u64 e1_start, u64 e1_end,
+                                   u64 e2_start, u64 e2_end)
+{
+       insert_test_extent(c, e1_start, e1_end);
+       insert_test_extent(c, e2_start, e2_end);
+
+       delete_test_keys(c);
+}
+
+static void test_extent_overwrite_front(struct bch_fs *c, u64 nr)
+{
+       __test_extent_overwrite(c, 0, 64, 0, 32);
+       __test_extent_overwrite(c, 8, 64, 0, 32);
+}
+
+static void test_extent_overwrite_back(struct bch_fs *c, u64 nr)
+{
+       __test_extent_overwrite(c, 0, 64, 32, 64);
+       __test_extent_overwrite(c, 0, 64, 32, 72);
+}
+
+static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
+{
+       __test_extent_overwrite(c, 0, 64, 32, 40);
+}
+
+static void test_extent_overwrite_all(struct bch_fs *c, u64 nr)
+{
+       __test_extent_overwrite(c, 32, 64,  0,  64);
+       __test_extent_overwrite(c, 32, 64,  0, 128);
+       __test_extent_overwrite(c, 32, 64, 32,  64);
+       __test_extent_overwrite(c, 32, 64, 32, 128);
+}
+
 /* perf tests */
 
 static u64 test_rand(void)
@@ -294,7 +351,7 @@ static void rand_insert(struct bch_fs *c, u64 nr)
                k.k.p.offset = test_rand();
 
                ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 }
@@ -335,7 +392,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr)
                        bkey_cookie_init(&k.k_i);
                        k.k.p = iter.pos;
 
-                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                       ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                                   BTREE_INSERT_ENTRY(&iter, &k.k_i));
                        BUG_ON(ret);
                }
@@ -356,7 +413,7 @@ static void rand_delete(struct bch_fs *c, u64 nr)
                k.k.p.offset = test_rand();
 
                ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
-                                       NULL, NULL, NULL, 0);
+                                       NULL, NULL, 0);
                BUG_ON(ret);
        }
 }
@@ -375,7 +432,7 @@ static void seq_insert(struct bch_fs *c, u64 nr)
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
                insert.k.p = iter.pos;
 
-               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+               ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                BTREE_INSERT_ENTRY(&iter, &insert.k_i));
                BUG_ON(ret);
 
@@ -407,7 +464,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr)
 
                bkey_reassemble(&u.k_i, k);
 
-               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+               ret = bch2_btree_insert_at(c, NULL, NULL, 0,
                                           BTREE_INSERT_ENTRY(&iter, &u.k_i));
                BUG_ON(ret);
        }
@@ -420,7 +477,7 @@ static void seq_delete(struct bch_fs *c, u64 nr)
 
        ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
                                      POS(0, 0), POS(0, U64_MAX),
-                                     ZERO_VERSION, NULL, NULL, NULL);
+                                     NULL);
        BUG_ON(ret);
 }
 
@@ -498,6 +555,11 @@ void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
        perf_test(test_iterate_slots);
        perf_test(test_iterate_slots_extents);
 
+       perf_test(test_extent_overwrite_front);
+       perf_test(test_extent_overwrite_back);
+       perf_test(test_extent_overwrite_middle);
+       perf_test(test_extent_overwrite_all);
+
        if (!j.fn) {
                pr_err("unknown test %s", testname);
                return;
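
For reference, the overlap cases the new extent tests exercise, with e2 inserted on top of e1 (diagram is illustrative only; insert_test_extent() stamps each key with an increasing version number):

	front:   e1 |==========|      back:    e1 |==========|
	         e2 |====|                     e2       |====|

	middle:  e1 |==========|      all:     e1    |====|
	         e2    |====|                  e2 |==========|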
index 7d0fee3a8c0495994aa6cbb6da0f4f7e348daeef..398bc534cc3c8b230e1607cd8467232132a10e6a 100644 (file)
@@ -110,12 +110,12 @@ const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
        }
 }
 
-void bch2_xattr_to_text(struct bch_fs *c, char *buf,
-                       size_t size, struct bkey_s_c k)
+int bch2_xattr_to_text(struct bch_fs *c, char *buf,
+                      size_t size, struct bkey_s_c k)
 {
+       char *out = buf, *end = buf + size;
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr;
-       size_t n = 0;
 
        switch (k.k->type) {
        case BCH_XATTR:
@@ -123,24 +123,26 @@ void bch2_xattr_to_text(struct bch_fs *c, char *buf,
 
                handler = bch2_xattr_type_to_handler(xattr.v->x_type);
                if (handler && handler->prefix)
-                       n += scnprintf(buf + n, size - n, "%s", handler->prefix);
+                       out += scnprintf(out, end - out, "%s", handler->prefix);
                else if (handler)
-                       n += scnprintf(buf + n, size - n, "(type %u)",
-                                      xattr.v->x_type);
+                       out += scnprintf(out, end - out, "(type %u)",
+                                        xattr.v->x_type);
                else
-                       n += scnprintf(buf + n, size - n, "(unknown type %u)",
-                                      xattr.v->x_type);
-
-               n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
-                                  xattr.v->x_name_len);
-               n += scnprintf(buf + n, size - n, ":");
-               n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
-                                  le16_to_cpu(xattr.v->x_val_len));
+                       out += scnprintf(out, end - out, "(unknown type %u)",
+                                        xattr.v->x_type);
+
+               out += bch_scnmemcpy(out, end - out, xattr.v->x_name,
+                                    xattr.v->x_name_len);
+               out += scnprintf(out, end - out, ":");
+               out += bch_scnmemcpy(out, end - out, xattr_val(xattr.v),
+                                    le16_to_cpu(xattr.v->x_val_len));
                break;
        case BCH_XATTR_WHITEOUT:
-               scnprintf(buf, size, "whiteout");
+               out += scnprintf(out, end - out, "whiteout");
                break;
        }
+
+       return out - buf;
 }
 
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
@@ -433,7 +435,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
+       ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
        mutex_unlock(&inode->ei_update_lock);
 
        if (value &&
index 0689d327cdc4ea91b9969f8f847bcf746c318223..cd1e7ad3e20a7bc1afe2078723063dcff5a503f9 100644 (file)
@@ -6,7 +6,7 @@
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
 
 const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+int bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
 
 #define bch2_bkey_xattr_ops (struct bkey_ops) {                \
        .key_invalid    = bch2_xattr_invalid,           \