Update bcachefs sources to ece184f718 bcachefs: Reflink
author     Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 21 Aug 2019 17:17:42 +0000 (13:17 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 21 Aug 2019 17:19:36 +0000 (13:19 -0400)
36 files changed:
.bcachefs_revision
include/linux/sched/signal.h [new file with mode: 0644]
libbcachefs/alloc_background.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_gc.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/reflink.c [new file with mode: 0644]
libbcachefs/reflink.h [new file with mode: 0644]
libbcachefs/replicas.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 31858ae614b1194f23be9cc76faf5ea1d3d330ef..fd1cda2aafa1b6861c61c7fc3666bb85ce42a79d 100644
@@ -1 +1 @@
-22776fe9902b0b06d6aa18cd4c7f0c5ad35a95fa
+ece184f718c2b678738bc2c42906e90eeb8ba7dc
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
new file mode 100644
index 0000000..e69de29
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 43dc2f270dc600b7bb4b3e5444729dad744629a0..4cf728cea393fe5002370c381635041731d83a7e 100644
@@ -232,7 +232,7 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
-               bch2_mark_key(c, k, 0, NULL, 0,
+               bch2_mark_key(c, k, 0, 0, NULL, 0,
                              BCH_BUCKET_MARK_ALLOC_READ|
                              BCH_BUCKET_MARK_NOATOMIC);
 
@@ -244,7 +244,8 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 
        for_each_journal_key(*journal_keys, j)
                if (j->btree_id == BTREE_ID_ALLOC)
-                       bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0,
+                       bch2_mark_key(c, bkey_i_to_s_c(j->k),
+                                     0, 0, NULL, 0,
                                      BCH_BUCKET_MARK_ALLOC_READ|
                                      BCH_BUCKET_MARK_NOATOMIC);
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 907d1b605cf4a7f240b4617a92495ebdab0575f8..1e601e7b3de51a483ce859955efe9f0b910be328 100644
@@ -359,6 +359,7 @@ enum gc_phase {
        GC_PHASE_BTREE_XATTRS,
        GC_PHASE_BTREE_ALLOC,
        GC_PHASE_BTREE_QUOTAS,
+       GC_PHASE_BTREE_REFLINK,
 
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
@@ -746,6 +747,9 @@ struct bch_fs {
        struct work_struct      ec_stripe_delete_work;
        struct llist_head       ec_stripe_delete_list;
 
+       /* REFLINK */
+       u64                     reflink_hint;
+
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
        struct bio_set          dio_write_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 13285936dd2dc02ad29e405116f971c0a379473c..667170b54ce6769d0d748d14ad2d12856895a6a4 100644
@@ -336,7 +336,9 @@ static inline void bkey_init(struct bkey *k)
        x(xattr,                11)                     \
        x(alloc,                12)                     \
        x(quota,                13)                     \
-       x(stripe,               14)
+       x(stripe,               14)                     \
+       x(reflink_p,            15)                     \
+       x(reflink_v,            16)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -891,6 +893,24 @@ struct bch_stripe {
        struct bch_extent_ptr   ptrs[0];
 } __attribute__((packed, aligned(8)));
 
+/* Reflink: */
+
+struct bch_reflink_p {
+       struct bch_val          v;
+       __le64                  idx;
+
+       __le32                  reservation_generation;
+       __u8                    nr_replicas;
+       __u8                    pad[3];
+};
+
+struct bch_reflink_v {
+       struct bch_val          v;
+       __le64                  refcount;
+       union bch_extent_entry  start[0];
+       __u64                   _data[0];
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
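
For orientation: a reflink_p key lives in the extents btree and carries only an index into the new reflink btree; the reflink_v key found there holds the refcount and the actual extent pointers. A minimal user-space sketch of that indirection, using simplified stand-in types (all toy_* names are illustrative, not bcachefs API):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the on-disk structures above. */
struct toy_reflink_v {
	uint64_t refcount;	/* how many reflink_p keys point here */
	const char *data;	/* stands in for the extent pointers  */
};

struct toy_reflink_p {
	uint64_t idx;		/* index into the reflink "btree" */
};

/* The reflink btree, modeled as a flat table. */
static struct toy_reflink_v reflink_tree[] = {
	[7] = { .refcount = 2, .data = "shared extent" },
};

/* Resolving a reflink_p means one extra lookup through idx: */
static const char *toy_resolve(struct toy_reflink_p p)
{
	return reflink_tree[p.idx].data;
}

int main(void)
{
	struct toy_reflink_p a = { .idx = 7 }, b = { .idx = 7 };

	/* Two files' extents resolve to the same indirect extent. */
	printf("%s / %s\n", toy_resolve(a), toy_resolve(b));
	return 0;
}
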
@@ -1293,6 +1313,7 @@ enum bch_sb_features {
        BCH_FEATURE_ATOMIC_NLINK        = 3, /* should have gone under compat */
        BCH_FEATURE_EC                  = 4,
        BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
+       BCH_FEATURE_REFLINK             = 6,
        BCH_FEATURE_NR,
 };
 
@@ -1480,7 +1501,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN,     struct jset, flags, 4, 5);
        x(XATTRS,       3, "xattrs")                    \
        x(ALLOC,        4, "alloc")                     \
        x(QUOTAS,       5, "quotas")                    \
-       x(EC,           6, "erasure_coding")
+       x(EC,           6, "erasure_coding")            \
+       x(REFLINK,      7, "reflink")
 
 enum btree_id {
 #define x(kwd, val, name) BTREE_ID_##kwd = val,
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 1acff9d0fd7efadec12883f12185a77dfea042d7..5ef66aed338d9f9a8c65d40a54378a0720a60870 100644
@@ -50,7 +50,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
        k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-#define bkey_val_end(_k)       vstruct_idx((_k).v, bkey_val_u64s((_k).k))
+#define bkey_val_end(_k)       ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
 
 #define bkey_deleted(_k)       ((_k)->type == KEY_TYPE_deleted)
 
@@ -552,6 +552,8 @@ BKEY_VAL_ACCESSORS(xattr);
 BKEY_VAL_ACCESSORS(alloc);
 BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
+BKEY_VAL_ACCESSORS(reflink_p);
+BKEY_VAL_ACCESSORS(reflink_v);
 
 /* byte order helpers */
 
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 27f196ef0b186963df59c6523479db6593e11eee..6fa6ac1fadc13494c826c0175a5588220558862f 100644
 #include "extents.h"
 #include "inode.h"
 #include "quota.h"
+#include "reflink.h"
 #include "xattr.h"
 
-const char * const bch_bkey_types[] = {
+const char * const bch2_bkey_types[] = {
 #define x(name, nr) #name,
        BCH_BKEY_TYPES()
 #undef x
@@ -159,7 +160,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 
 void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
 {
-       pr_buf(out, "u64s %u type %u ", k->u64s, k->type);
+       pr_buf(out, "u64s %u type %s ", k->u64s,
+              bch2_bkey_types[k->type]);
 
        bch2_bpos_to_text(out, k->p);
 
@@ -174,8 +176,6 @@ void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
 
        if (likely(ops->val_to_text))
                ops->val_to_text(out, c, k);
-       else
-               pr_buf(out, " %s", bch_bkey_types[k.k->type]);
 }
 
 void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 08b976633360583176961ef2e118a210d2b4cd24..e6e97cda4f501154c69fe09654a830391bf9a48b 100644
@@ -9,7 +9,7 @@ struct btree;
 struct bkey;
 enum btree_node_type;
 
-extern const char * const bch_bkey_types[];
+extern const char * const bch2_bkey_types[];
 
 enum merge_result {
        BCH_MERGE_NOMERGE,
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index ef10e77ec1e510aedec44fd5eb935a1d8d169ba0..32436ed5cc803c3c900cbf4e14c68a976351883b 100644
 static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
                                                  struct btree *);
 
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+       unsigned n = ARRAY_SIZE(iter->data);
+
+       while (n && __btree_node_iter_set_end(iter, n - 1))
+               --n;
+
+       return n;
+}
+
 struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
 {
        unsigned offset = __btree_node_key_to_offset(b, k);
@@ -110,7 +120,8 @@ void bch2_dump_btree_node_iter(struct btree *b,
 {
        struct btree_node_iter_set *set;
 
-       printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+       printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+              __btree_node_iter_used(iter), b->nsets);
 
        btree_node_iter_for_each(iter, set) {
                struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
@@ -119,8 +130,8 @@ void bch2_dump_btree_node_iter(struct btree *b,
                char buf[100];
 
                bch2_bkey_to_text(&PBUF(buf), &uk);
-               printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
-                      k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+               printk(KERN_ERR "set %zu key %u: %s\n",
+                      t - b->set, set->k, buf);
        }
 }
 
@@ -182,8 +193,12 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                                 struct btree *b)
 {
        struct btree_node_iter_set *set, *s2;
+       struct bkey_packed *k, *p;
        struct bset_tree *t;
 
+       if (bch2_btree_node_iter_end(iter))
+               return;
+
        /* Verify no duplicates: */
        btree_node_iter_for_each(iter, set)
                btree_node_iter_for_each(iter, s2)
@@ -204,6 +219,18 @@ found:
        btree_node_iter_for_each(iter, set)
                BUG_ON(set != iter->data &&
                       btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+       k = bch2_btree_node_iter_peek_all(iter, b);
+
+       for_each_bset(b, t) {
+               if (iter->data[0].end == t->end_offset)
+                       continue;
+
+               p = bch2_bkey_prev_all(b, t,
+                       bch2_btree_node_iter_bset_pos(iter, b, t));
+
+               BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+       }
 }
 
 void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
@@ -1669,25 +1696,13 @@ void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
        __bch2_btree_node_iter_advance(iter, b);
 }
 
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
-       unsigned n = ARRAY_SIZE(iter->data);
-
-       while (n && __btree_node_iter_set_end(iter, n - 1))
-               --n;
-
-       return n;
-}
-
 /*
  * Expensive:
  */
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
-                                                    struct btree *b,
-                                                    unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+                                                 struct btree *b)
 {
        struct bkey_packed *k, *prev = NULL;
-       struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
        struct btree_node_iter_set *set;
        struct bset_tree *t;
        unsigned end = 0;
@@ -1695,9 +1710,8 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
        bch2_btree_node_iter_verify(iter, b);
 
        for_each_bset(b, t) {
-               k = bch2_bkey_prev_filter(b, t,
-                       bch2_btree_node_iter_bset_pos(iter, b, t),
-                       min_key_type);
+               k = bch2_bkey_prev_all(b, t,
+                       bch2_btree_node_iter_bset_pos(iter, b, t));
                if (k &&
                    (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
                        prev = k;
@@ -1706,7 +1720,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *ite
        }
 
        if (!prev)
-               goto out;
+               return NULL;
 
        /*
         * We're manually memmoving instead of just calling sort() to ensure the
@@ -1727,18 +1741,20 @@ found:
 
        iter->data[0].k = __btree_node_key_to_offset(b, prev);
        iter->data[0].end = end;
-out:
-       if (btree_keys_expensive_checks(b)) {
-               struct btree_node_iter iter2 = *iter;
 
-               if (prev)
-                       __bch2_btree_node_iter_advance(&iter2, b);
+       bch2_btree_node_iter_verify(iter, b);
+       return prev;
+}
 
-               while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
-                       BUG_ON(k->type >= min_key_type);
-                       __bch2_btree_node_iter_advance(&iter2, b);
-               }
-       }
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
+                                                    struct btree *b,
+                                                    unsigned min_key_type)
+{
+       struct bkey_packed *prev;
+
+       do {
+               prev = bch2_btree_node_iter_prev_all(iter, b);
+       } while (prev && prev->type < min_key_type);
 
        return prev;
 }
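
The rewrite turns prev_filter into a plain skip loop over the new bch2_btree_node_iter_prev_all(), dropping the old expensive-checks block. The same pattern, reduced to a self-contained sketch over one sorted array (toy_* names are illustrative):

#include <stdio.h>
#include <stddef.h>

struct toy_key { unsigned type; int v; };

/* Step back one position unconditionally; NULL at the front. */
static struct toy_key *toy_prev_all(struct toy_key *base, size_t *pos)
{
	return *pos ? &base[--*pos] : NULL;
}

/* prev_filter is now just a skip loop on top of prev_all: */
static struct toy_key *toy_prev_filter(struct toy_key *base, size_t *pos,
				       unsigned min_key_type)
{
	struct toy_key *prev;

	do {
		prev = toy_prev_all(base, pos);
	} while (prev && prev->type < min_key_type);

	return prev;
}

int main(void)
{
	struct toy_key keys[] = { {2, 10}, {0, 20}, {0, 30} }; /* 0 = discard */
	size_t pos = 3;
	struct toy_key *k = toy_prev_filter(keys, &pos, 1);

	printf("%d\n", k ? k->v : -1);	/* skips the two type-0 keys: 10 */
	return 0;
}
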
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 17c2399473002ad9b578b8b22f9dfe2359a89329..643bd9e8bc4d938c873f75067dff0560f1910fe1 100644
@@ -528,15 +528,11 @@ bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
        return ret;
 }
 
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+                                                 struct btree *);
 struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
                                                     struct btree *, unsigned);
 
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
-{
-       return bch2_btree_node_iter_prev_filter(iter, b, 0);
-}
-
 static inline struct bkey_packed *
 bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
 {
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index a458cfe0e92d55f53fdca5099528752cb38f81f0..e43d48b8a34297f33a2eb196c7d8a27396d31d3a 100644
@@ -171,7 +171,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                *max_stale = max(*max_stale, ptr_stale(ca, ptr));
        }
 
-       bch2_mark_key(c, k, k.k->size, NULL, 0, flags);
+       bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
 fsck_err:
        return ret;
 }
@@ -418,7 +418,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
        for_each_pending_btree_node_free(c, as, d)
                if (d->index_update_done)
-                       bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0,
+                       bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+                                     0, 0, NULL, 0,
                                      BCH_BUCKET_MARK_GC);
 
        mutex_unlock(&c->btree_interior_update_lock);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 8955555d6603267fd491d15919ac9259957f7eca..a28d2dd7d5b3d2f23c8c600bf2e4cc5923565e1e 100644
@@ -86,7 +86,7 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
        struct btree_iter *linked;
        unsigned readers = 0;
 
-       EBUG_ON(btree_node_read_locked(iter, b->level));
+       EBUG_ON(!btree_node_intent_locked(iter, b->level));
 
        trans_for_each_iter(iter->trans, linked)
                if (linked->l[b->level].b == b &&
@@ -496,6 +496,23 @@ static inline void __bch2_btree_iter_verify(struct btree_iter *iter,
 
 #endif
 
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+                                       struct btree *b,
+                                       struct bset_tree *t,
+                                       struct bkey_packed *k)
+{
+       struct btree_node_iter_set *set;
+
+       btree_node_iter_for_each(iter, set)
+               if (set->end == t->end_offset) {
+                       set->k = __btree_node_key_to_offset(b, k);
+                       bch2_btree_node_iter_sort(iter, b);
+                       return;
+               }
+
+       bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
 static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
                                      struct btree *b,
                                      struct btree_node_iter *node_iter,
@@ -527,7 +544,8 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
                                bch2_btree_node_iter_peek_all(node_iter, b),
                                &iter->k);
        }
-       return;
+
+       goto iter_current_key_not_modified;
 found:
        set->end = t->end_offset;
 
@@ -569,60 +587,42 @@ found:
                        bkey_disassemble(l->b, k, &iter->k);
        }
 iter_current_key_not_modified:
-
        /*
-        * Interior nodes are special because iterators for interior nodes don't
-        * obey the usual invariants regarding the iterator position:
-        *
-        * We may have whiteouts that compare greater than the iterator
-        * position, and logically should be in the iterator, but that we
-        * skipped past to find the first live key greater than the iterator
-        * position. This becomes an issue when we insert a new key that is
-        * greater than the current iterator position, but smaller than the
-        * whiteouts we've already skipped past - this happens in the course of
-        * a btree split.
-        *
-        * We have to rewind the iterator past to before those whiteouts here,
-        * else bkey_node_iter_prev() is not going to work and who knows what
-        * else would happen. And we have to do it manually, because here we've
-        * already done the insert and the iterator is currently inconsistent:
-        *
-        * We've got multiple competing invariants, here - we have to be careful
-        * about rewinding iterators for interior nodes, because they should
-        * always point to the key for the child node the btree iterator points
-        * to.
+        * When a new key is added, and the node iterator now points to that
+        * key, the iterator might have skipped past deleted keys that should
+        * come after the key the iterator now points to. We have to rewind to
+        * before those deleted keys - otherwise bch2_btree_node_iter_prev_all()
+        * breaks:
         */
-       if (b->level && new_u64s &&
-           btree_iter_pos_cmp(iter, b, where) > 0) {
+       if (!bch2_btree_node_iter_end(node_iter) &&
+           (b->level ||
+            (iter->flags & BTREE_ITER_IS_EXTENTS))) {
                struct bset_tree *t;
-               struct bkey_packed *k;
+               struct bkey_packed *k, *k2, *p;
+
+               k = bch2_btree_node_iter_peek_all(node_iter, b);
 
                for_each_bset(b, t) {
-                       if (bch2_bkey_to_bset(b, where) == t)
+                       bool set_pos = false;
+
+                       if (node_iter->data[0].end == t->end_offset)
                                continue;
 
-                       k = bch2_bkey_prev_all(b, t,
-                               bch2_btree_node_iter_bset_pos(node_iter, b, t));
-                       if (k &&
-                           bkey_iter_cmp(b, k, where) > 0) {
-                               struct btree_node_iter_set *set;
-                               unsigned offset =
-                                       __btree_node_key_to_offset(b, bkey_next(k));
-
-                               btree_node_iter_for_each(node_iter, set)
-                                       if (set->k == offset) {
-                                               set->k = __btree_node_key_to_offset(b, k);
-                                               bch2_btree_node_iter_sort(node_iter, b);
-                                               goto next_bset;
-                                       }
-
-                               bch2_btree_node_iter_push(node_iter, b, k,
-                                               btree_bkey_last(b, t));
+                       k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+                       while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+                              bkey_iter_cmp(b, k, p) < 0) {
+                               k2 = p;
+                               set_pos = true;
                        }
-next_bset:
-                       t = t;
+
+                       if (set_pos)
+                               btree_node_iter_set_set_pos(node_iter,
+                                                           b, t, k2);
                }
        }
+
+       bch2_btree_node_iter_verify(node_iter, b);
 }
 
 void bch2_btree_node_iter_fix(struct btree_iter *iter,
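
The new comment above states the invariant being restored: after an insert, the node iterator may point at the new key while other bsets' positions still sit past deleted keys that compare greater than it. The rewind itself, modeled over plain sorted arrays (a sketch with illustrative toy_* names, not the kernel types):

#include <stdio.h>
#include <stddef.h>

/* One "bset": a sorted array plus the iterator's position into it. */
struct toy_set { const int *keys; size_t pos; };

/*
 * After an insert leaves the iterator pointing at key k in set k_set,
 * rewind every other set past entries comparing greater than k, so
 * that a subsequent prev() still visits keys in order.
 */
static void toy_rewind_others(struct toy_set *sets, size_t nr_sets,
			      size_t k_set, int k)
{
	for (size_t i = 0; i < nr_sets; i++) {
		if (i == k_set)
			continue;

		while (sets[i].pos && sets[i].keys[sets[i].pos - 1] > k)
			sets[i].pos--;
	}
}

int main(void)
{
	const int a[] = { 1, 9 }, b[] = { 3, 7 };
	struct toy_set sets[] = { { a, 1 }, { b, 2 } };

	toy_rewind_others(sets, 2, 0, 5);	/* inserted 5 into set 0 */
	printf("set 1 rewound to %zu\n", sets[1].pos);	/* 1: past key 7 */
	return 0;
}
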
@@ -1436,8 +1436,7 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
 
 recheck:
        while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
-              bkey_deleted(k.k) &&
-              bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
+              bkey_cmp(k.k->p, iter->pos) <= 0)
                bch2_btree_node_iter_advance(&l->iter, l->b);
 
        /*
@@ -1477,6 +1476,8 @@ recheck:
                EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
                EBUG_ON(bkey_deleted(k.k));
                iter->uptodate = BTREE_ITER_UPTODATE;
+
+               __bch2_btree_iter_verify(iter, l->b);
                return k;
        }
 
@@ -1507,6 +1508,8 @@ recheck:
 
        iter->k = n;
        iter->uptodate = BTREE_ITER_UPTODATE;
+
+       __bch2_btree_iter_verify(iter, l->b);
        return (struct bkey_s_c) { &iter->k, NULL };
 }
 
@@ -1539,19 +1542,18 @@ recheck:
                goto recheck;
        }
 
-       if (k.k &&
-           !bkey_deleted(k.k) &&
-           !bkey_cmp(iter->pos, k.k->p)) {
-               iter->uptodate = BTREE_ITER_UPTODATE;
-               return k;
-       } else {
+       if (!k.k ||
+           bkey_deleted(k.k) ||
+           bkey_cmp(iter->pos, k.k->p)) {
                /* hole */
                bkey_init(&iter->k);
                iter->k.p = iter->pos;
-
-               iter->uptodate = BTREE_ITER_UPTODATE;
-               return (struct bkey_s_c) { &iter->k, NULL };
+               k = (struct bkey_s_c) { &iter->k, NULL };
        }
+
+       iter->uptodate = BTREE_ITER_UPTODATE;
+       __bch2_btree_iter_verify(iter, l->b);
+       return k;
 }
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
@@ -1779,6 +1781,12 @@ found:
 
                iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
                iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+
+               if ((iter->flags & BTREE_ITER_INTENT) &&
+                   !bch2_btree_iter_upgrade(iter, 1)) {
+                       trace_trans_restart_upgrade(trans->ip);
+                       return ERR_PTR(-EINTR);
+               }
        }
 
        BUG_ON(iter->btree_id != btree_id);
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 9483ec8913e3199afbbe897606bee48f148c0caf..249df21b9a97385094db8365f1c06bebcaea762b 100644
@@ -242,7 +242,7 @@ static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
                                            (_start), (_flags))) ?:     \
                      PTR_ERR_OR_ZERO(((_k) =                           \
                        __bch2_btree_iter_peek(_iter, _flags)).k);      \
-            !ret && (_k).k;                                            \
+            !_ret && (_k).k;                                           \
             (_ret) = PTR_ERR_OR_ZERO(((_k) =                           \
                        __bch2_btree_iter_next(_iter, _flags)).k))
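
The one-character fix above (ret -> _ret) is macro hygiene: the loop condition was reading whatever `ret` happened to exist in the caller's scope instead of the macro's `_ret` parameter. A minimal standalone illustration of that failure mode (hypothetical macros, not the bcachefs ones):

#include <stdio.h>

/* Buggy: the loop condition reads the caller's `ret`,
 * not the macro parameter `_ret`. */
#define for_upto_buggy(_i, _n, _ret)				\
	for ((_i) = 0, (_ret) = 0; !ret && (_i) < (_n); (_i)++)

/* Fixed: every reference goes through the parameter. */
#define for_upto(_i, _n, _ret)					\
	for ((_i) = 0, (_ret) = 0; !(_ret) && (_i) < (_n); (_i)++)

int main(void)
{
	int i, err = 0;
	int ret = 1;	/* unrelated variable in the caller's scope */

	for_upto_buggy(i, 3, err)	/* never iterates: !ret is false */
		printf("buggy %d\n", i);

	for_upto(i, 3, err)		/* iterates 0, 1, 2 as intended */
		printf("fixed %d\n", i);

	return 0;
}
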
 
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 91aa30a6ed2f8bc7c3b95fa7328b8b90376b6629..f4e1bfe129a0815a43682cca96bf0d3bd709106f 100644
@@ -461,7 +461,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
 
 static inline bool btree_node_type_is_extents(enum btree_node_type type)
 {
-       return type == BKEY_TYPE_EXTENTS;
+       switch (type) {
+       case BKEY_TYPE_EXTENTS:
+       case BKEY_TYPE_REFLINK:
+               return true;
+       default:
+               return false;
+       }
 }
 
 static inline bool btree_node_is_extents(struct btree *b)
@@ -477,6 +483,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
        case BKEY_TYPE_EXTENTS:
        case BKEY_TYPE_INODES:
        case BKEY_TYPE_EC:
+       case BKEY_TYPE_REFLINK:
                return true;
        default:
                return false;
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 9294137719df771235d3918456cc4ae840e6e9cd..6813eddd26f51dd9c6673cdace416e1c08d4f26c 100644
@@ -194,7 +194,7 @@ found:
                       : gc_pos_btree_root(as->btree_id)) >= 0 &&
            gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
                bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
-                             0, NULL, 0,
+                             0, 0, NULL, 0,
                              BCH_BUCKET_MARK_OVERWRITE|
                              BCH_BUCKET_MARK_GC);
 }
@@ -266,11 +266,12 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c,
 {
        BUG_ON(!pending->index_update_done);
 
-       bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
-                     BCH_BUCKET_MARK_OVERWRITE);
+       bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+                     0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE);
 
        if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
-               bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
+               bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+                             0, 0, NULL, 0,
                              BCH_BUCKET_MARK_OVERWRITE|
                              BCH_BUCKET_MARK_GC);
 }
@@ -1077,11 +1078,11 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
        fs_usage = bch2_fs_usage_scratch_get(c);
 
        bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
-                     0, fs_usage, 0,
+                     0, 0, fs_usage, 0,
                      BCH_BUCKET_MARK_INSERT);
        if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
                bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
-                                    0, NULL, 0,
+                                    0, 0, NULL, 0,
                                     BCH_BUCKET_MARK_INSERT|
                                     BCH_BUCKET_MARK_GC);
 
@@ -1175,12 +1176,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        fs_usage = bch2_fs_usage_scratch_get(c);
 
        bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
-                            0, fs_usage, 0,
+                            0, 0, fs_usage, 0,
                             BCH_BUCKET_MARK_INSERT);
 
        if (gc_visited(c, gc_pos_btree_node(b)))
                bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
-                                    0, NULL, 0,
+                                    0, 0, NULL, 0,
                                     BCH_BUCKET_MARK_INSERT|
                                     BCH_BUCKET_MARK_GC);
 
@@ -2003,11 +2004,11 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                fs_usage = bch2_fs_usage_scratch_get(c);
 
                bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
-                             0, fs_usage, 0,
+                             0, 0, fs_usage, 0,
                              BCH_BUCKET_MARK_INSERT);
                if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
                        bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
-                                            0, NULL, 0,
+                                            0, 0, NULL, 0,
                                             BCH_BUCKET_MARK_INSERT||
                                             BCH_BUCKET_MARK_GC);
 
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 4f12108bd6fe6d120239542a035a382fad435cda..906e4999e10c94c638e25f0fc72289f1201c5dcb 100644
@@ -400,8 +400,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
                BUG_ON(i->iter->level);
                BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
                EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
-                       !bch2_extent_is_atomic(i->k, i->iter));
-
+                       bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
                EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
                        !(trans->flags & BTREE_INSERT_ATOMIC));
        }
@@ -522,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans,
 {
        return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
                (i->iter->btree_id == BTREE_ID_EXTENTS ||
-                i->iter->btree_id == BTREE_ID_INODES);
+                i->iter->btree_id == BTREE_ID_INODES ||
+                i->iter->btree_id == BTREE_ID_REFLINK);
 }
 
 static inline bool update_has_triggers(struct btree_trans *trans,
@@ -923,8 +923,6 @@ out_noupdates:
                bch2_trans_unlink_iters(trans, ~trans->iters_touched|
                                        trans->iters_unlink_on_commit);
                trans->iters_touched = 0;
-       } else {
-               bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit);
        }
        trans->nr_updates       = 0;
        trans->mem_top          = 0;
@@ -1033,7 +1031,10 @@ retry:
                        /* create the biggest key we can */
                        bch2_key_resize(&delete.k, max_sectors);
                        bch2_cut_back(end, &delete.k);
-                       bch2_extent_trim_atomic(&delete, iter);
+
+                       ret = bch2_extent_trim_atomic(&delete, iter);
+                       if (ret)
+                               break;
                }
 
                bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete));
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 81c3c3137825ed56de4fb34f8e5951118c9591e2..d6dcbf91cd80133f50c8b561cd6ede11281993fe 100644
@@ -405,7 +405,8 @@ int bch2_fs_usage_apply(struct bch_fs *c,
         */
        should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
        if (WARN_ONCE(should_not_have_added > 0,
-                     "disk usage increased without a reservation")) {
+                     "disk usage increased by %lli without a reservation",
+                     should_not_have_added)) {
                atomic64_sub(should_not_have_added, &c->sectors_available);
                added -= should_not_have_added;
                ret = -1;
@@ -810,23 +811,24 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 }
 
 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
-                                 s64 delta)
+                                 unsigned offset, s64 delta,
+                                 unsigned flags)
 {
-       if (delta > 0) {
-               /*
-                * marking a new extent, which _will have size_ @delta
-                *
-                * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
-                * case, we haven't actually created the key we'll be inserting
-                * yet (for the split) - so we don't want to be using
-                * k->size/crc.live_size here:
-                */
-               return __ptr_disk_sectors(p, delta);
+       if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
+               BUG_ON(offset + -delta > p.crc.live_size);
+
+               return -((s64) ptr_disk_sectors(p)) +
+                       __ptr_disk_sectors(p, offset) +
+                       __ptr_disk_sectors(p, p.crc.live_size -
+                                          offset + delta);
+       } else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+               BUG_ON(offset + -delta > p.crc.live_size);
+
+               return -((s64) ptr_disk_sectors(p)) +
+                       __ptr_disk_sectors(p, p.crc.live_size +
+                                          delta);
        } else {
-               BUG_ON(-delta > p.crc.live_size);
-
-               return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
-                       (s64) ptr_disk_sectors(p);
+               return ptr_disk_sectors(p);
        }
 }
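
The rewritten helper computes the disk-usage delta directly from the overwrite's offset and (negative) size instead of from a pre-split key. Ignoring compression, where __ptr_disk_sectors() reduces to the identity, the OVERWRITE_SPLIT case reads: drop the whole old extent, add back the surviving front and tail. A worked, self-contained version of that arithmetic (assumes uncompressed extents):

#include <assert.h>
#include <stdio.h>

/*
 * Overwriting `-delta` sectors starting `offset` sectors into a live
 * extent of `live_size` sectors splits it in two.  Uncompressed case:
 * disk delta = -live_size + front + tail.
 */
static long long split_disk_delta(unsigned live_size,
				  unsigned offset, long long delta)
{
	assert(delta < 0 && offset + -delta <= live_size);

	return -(long long) live_size		/* drop the old extent  */
		+ offset			/* front piece survives */
		+ (live_size - offset + delta);	/* tail piece survives  */
}

int main(void)
{
	/* 100-sector extent, middle 20 sectors overwritten at offset 40:
	 * -100 + 40 + (100 - 40 - 20) = -20 disk sectors. */
	printf("%lld\n", split_disk_delta(100, 40, -20));
	return 0;
}
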
 
@@ -970,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
                spin_unlock(&c->ec_stripes_heap_lock);
                bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
                                    (u64) p.idx);
-               return -1;
+               return -EIO;
        }
 
        BUG_ON(m->r.e.data_type != data_type);
@@ -1005,7 +1007,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 }
 
 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
-                           s64 sectors, enum bch_data_type data_type,
+                           unsigned offset, s64 sectors,
+                           enum bch_data_type data_type,
                            struct bch_fs_usage *fs_usage,
                            unsigned journal_seq, unsigned flags)
 {
@@ -1026,7 +1029,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = data_type == BCH_DATA_BTREE
                        ? sectors
-                       : ptr_disk_sectors_delta(p, sectors);
+                       : ptr_disk_sectors_delta(p, offset, sectors, flags);
                bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
                                        fs_usage, journal_seq, flags);
 
@@ -1115,7 +1118,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 }
 
 int bch2_mark_key_locked(struct bch_fs *c,
-                  struct bkey_s_c k, s64 sectors,
+                  struct bkey_s_c k,
+                  unsigned offset, s64 sectors,
                   struct bch_fs_usage *fs_usage,
                   u64 journal_seq, unsigned flags)
 {
@@ -1136,11 +1140,12 @@ int bch2_mark_key_locked(struct bch_fs *c,
                        ?  c->opts.btree_node_size
                        : -c->opts.btree_node_size;
 
-               ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE,
+               ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE,
                                fs_usage, journal_seq, flags);
                break;
        case KEY_TYPE_extent:
-               ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+       case KEY_TYPE_reflink_v:
+               ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
                                fs_usage, journal_seq, flags);
                break;
        case KEY_TYPE_stripe:
@@ -1171,14 +1176,14 @@ int bch2_mark_key_locked(struct bch_fs *c,
 }
 
 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
-                 s64 sectors,
+                 unsigned offset, s64 sectors,
                  struct bch_fs_usage *fs_usage,
                  u64 journal_seq, unsigned flags)
 {
        int ret;
 
        percpu_down_read(&c->mark_lock);
-       ret = bch2_mark_key_locked(c, k, sectors,
+       ret = bch2_mark_key_locked(c, k, offset, sectors,
                                   fs_usage, journal_seq, flags);
        percpu_up_read(&c->mark_lock);
 
@@ -1194,8 +1199,11 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
 {
        struct bch_fs           *c = trans->c;
        struct btree            *b = iter->l[0].b;
+       unsigned                offset = 0;
        s64                     sectors = 0;
 
+       flags |= BCH_BUCKET_MARK_OVERWRITE;
+
        if (btree_node_is_extents(b)
            ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
            : bkey_cmp(new->k.p, old.k->p))
@@ -1204,35 +1212,33 @@ inline int bch2_mark_overwrite(struct btree_trans *trans,
        if (btree_node_is_extents(b)) {
                switch (bch2_extent_overlap(&new->k, old.k)) {
                case BCH_EXTENT_OVERLAP_ALL:
+                       offset = 0;
                        sectors = -((s64) old.k->size);
                        break;
                case BCH_EXTENT_OVERLAP_BACK:
+                       offset = bkey_start_offset(&new->k) -
+                               bkey_start_offset(old.k);
                        sectors = bkey_start_offset(&new->k) -
                                old.k->p.offset;
                        break;
                case BCH_EXTENT_OVERLAP_FRONT:
+                       offset = 0;
                        sectors = bkey_start_offset(old.k) -
                                new->k.p.offset;
                        break;
                case BCH_EXTENT_OVERLAP_MIDDLE:
-                       sectors = old.k->p.offset - new->k.p.offset;
-                       BUG_ON(sectors <= 0);
-
-                       bch2_mark_key_locked(c, old, sectors,
-                               fs_usage, trans->journal_res.seq,
-                               BCH_BUCKET_MARK_INSERT|flags);
-
-                       sectors = bkey_start_offset(&new->k) -
-                               old.k->p.offset;
+                       offset = bkey_start_offset(&new->k) -
+                               bkey_start_offset(old.k);
+                       sectors = -((s64) new->k.size);
+                       flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
                        break;
                }
 
                BUG_ON(sectors >= 0);
        }
 
-       return bch2_mark_key_locked(c, old, sectors, fs_usage,
-                                   trans->journal_res.seq,
-                                   BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1;
+       return bch2_mark_key_locked(c, old, offset, sectors, fs_usage,
+                                   trans->journal_res.seq, flags) ?: 1;
 }
 
 int bch2_mark_update(struct btree_trans *trans,
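
Each overlap case above now reduces to a pair (offset, sectors): where in the old key the overwrite begins, and how many of its sectors go away (negative). A compact restatement with half-open interval coordinates (toy_* names are illustrative; some overlap between old and new is assumed):

#include <stdio.h>

struct overwrite { unsigned offset; long long sectors; int split; };

/* Old extent [os, oe) overwritten by new extent [ns, ne). */
static struct overwrite toy_overlap(long long os, long long oe,
				    long long ns, long long ne)
{
	struct overwrite r = { 0, 0, 0 };

	if (ns <= os && ne >= oe) {		/* OVERLAP_ALL    */
		r.sectors = -(oe - os);
	} else if (ns > os && ne >= oe) {	/* OVERLAP_BACK   */
		r.offset  = ns - os;
		r.sectors = ns - oe;
	} else if (ns <= os) {			/* OVERLAP_FRONT  */
		r.sectors = os - ne;
	} else {				/* OVERLAP_MIDDLE */
		r.offset  = ns - os;
		r.sectors = -(ne - ns);
		r.split   = 1;	/* old extent survives on both sides */
	}
	return r;
}

int main(void)
{
	struct overwrite r = toy_overlap(0, 100, 40, 60);

	printf("offset %u sectors %lld split %d\n",
	       r.offset, r.sectors, r.split);
	return 0;
}
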
@@ -1252,8 +1258,7 @@ int bch2_mark_update(struct btree_trans *trans,
 
        if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT))
                bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
-                       bpos_min(insert->k->k.p, b->key.k.p).offset -
-                       bkey_start_offset(&insert->k->k),
+                       0, insert->k->k.size,
                        fs_usage, trans->journal_res.seq,
                        BCH_BUCKET_MARK_INSERT|flags);
 
@@ -1300,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
            xchg(&warned_disk_usage, 1))
                return;
 
-       pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+       bch_err(c, "disk usage increased more than %llu sectors reserved",
+               disk_res_sectors);
 
        trans_for_each_update_iter(trans, i) {
                struct btree_iter       *iter = i->iter;
@@ -1315,7 +1321,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
 
                node_iter = iter->l[0].iter;
                while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
-                                                             KEY_TYPE_discard))) {
+                                                       KEY_TYPE_discard))) {
                        struct bkey             unpacked;
                        struct bkey_s_c         k;
 
@@ -1341,15 +1347,20 @@ static int trans_get_key(struct btree_trans *trans,
                         struct btree_iter **iter,
                         struct bkey_s_c *k)
 {
-       unsigned i;
+       struct btree_insert_entry *i;
        int ret;
 
-       for (i = 0; i < trans->nr_updates; i++)
-               if (!trans->updates[i].deferred &&
-                   trans->updates[i].iter->btree_id == btree_id &&
-                   !bkey_cmp(pos, trans->updates[i].iter->pos)) {
-                       *iter   = trans->updates[i].iter;
-                       *k      = bkey_i_to_s_c(trans->updates[i].k);
+       for (i = trans->updates;
+            i < trans->updates + trans->nr_updates;
+            i++)
+               if (!i->deferred &&
+                   i->iter->btree_id == btree_id &&
+                   (btree_node_type_is_extents(btree_id)
+                    ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
+                      bkey_cmp(pos, i->k->k.p) < 0
+                    : !bkey_cmp(pos, i->iter->pos))) {
+                       *iter   = i->iter;
+                       *k      = bkey_i_to_s_c(i->k);
                        return 0;
                }
 
@@ -1358,6 +1369,8 @@ static int trans_get_key(struct btree_trans *trans,
        if (IS_ERR(*iter))
                return PTR_ERR(*iter);
 
+       bch2_trans_iter_free_on_commit(trans, *iter);
+
        *k = bch2_btree_iter_peek_slot(*iter);
        ret = bkey_err(*k);
        if (ret)
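
trans_get_key() now matches pending updates by range on extent btrees: an update covering [start, end) satisfies any lookup position inside it, not just an exact iterator-position match. The predicate in isolation (a sketch with illustrative names):

#include <stdbool.h>
#include <stdio.h>

/* An extent update: covers [start, end), keyed by its end position. */
struct toy_update { unsigned long long start, end; };

static bool toy_update_matches(bool is_extents, unsigned long long pos,
			       struct toy_update u,
			       unsigned long long iter_pos)
{
	return is_extents
		/* extents: any position inside [start, end) matches */
		? pos >= u.start && pos < u.end
		/* other btrees: exact iterator position only */
		: pos == iter_pos;
}

int main(void)
{
	struct toy_update u = { 8, 24 };

	printf("%d %d\n",
	       toy_update_matches(true, 16, u, 24),	/* 1: inside range  */
	       toy_update_matches(false, 16, u, 24));	/* 0: not exact pos */
	return 0;
}
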
@@ -1460,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                        struct bch_extent_stripe_ptr p,
                        s64 sectors, enum bch_data_type data_type)
 {
+       struct bch_fs *c = trans->c;
        struct bch_replicas_padded r;
        struct btree_iter *iter;
        struct bkey_i *new_k;
@@ -1476,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                return ret;
 
        if (k.k->type != KEY_TYPE_stripe) {
-               bch_err_ratelimited(trans->c,
-                                   "pointer to nonexistent stripe %llu",
-                                   (u64) p.idx);
-               ret = -1;
+               bch2_fs_inconsistent(c,
+                       "pointer to nonexistent stripe %llu",
+                       (u64) p.idx);
+               ret = -EIO;
                goto out;
        }
 
@@ -1511,8 +1525,9 @@ out:
 }
 
 static int bch2_trans_mark_extent(struct btree_trans *trans,
-                       struct bkey_s_c k,
-                       s64 sectors, enum bch_data_type data_type)
+                       struct bkey_s_c k, unsigned offset,
+                       s64 sectors, unsigned flags,
+                       enum bch_data_type data_type)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
@@ -1532,7 +1547,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = data_type == BCH_DATA_BTREE
                        ? sectors
-                       : ptr_disk_sectors_delta(p, sectors);
+                       : ptr_disk_sectors_delta(p, offset, sectors, flags);
 
                ret = bch2_trans_mark_pointer(trans, p, disk_sectors,
                                              data_type);
@@ -1566,8 +1581,86 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                       struct bkey_s_c_reflink_p p,
+                       u64 idx, unsigned sectors,
+                       unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter;
+       struct bkey_i *new_k;
+       struct bkey_s_c k;
+       struct bkey_i_reflink_v *r_v;
+       s64 ret;
+
+       ret = trans_get_key(trans, BTREE_ID_REFLINK,
+                           POS(0, idx), &iter, &k);
+       if (ret)
+               return ret;
+
+       if (k.k->type != KEY_TYPE_reflink_v) {
+               bch2_fs_inconsistent(c,
+                       "%llu:%llu len %u points to nonexistent indirect extent %llu",
+                       p.k->p.inode, p.k->p.offset, p.k->size, idx);
+               ret = -EIO;
+               goto err;
+       }
+
+       if ((flags & BCH_BUCKET_MARK_OVERWRITE) &&
+           (bkey_start_offset(k.k) < idx ||
+            k.k->p.offset > idx + sectors))
+               goto out;
+
+       bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+       BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+       new_k = trans_update_key(trans, iter, k.k->u64s);
+       ret = PTR_ERR_OR_ZERO(new_k);
+       if (ret)
+               goto err;
+
+       bkey_reassemble(new_k, k);
+       r_v = bkey_i_to_reflink_v(new_k);
+
+       le64_add_cpu(&r_v->v.refcount,
+                    !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1);
+
+       if (!r_v->v.refcount) {
+               r_v->k.type = KEY_TYPE_deleted;
+               set_bkey_val_u64s(&r_v->k, 0);
+       }
+out:
+       ret = k.k->p.offset - idx;
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
+
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                       struct bkey_s_c_reflink_p p, unsigned offset,
                        s64 sectors, unsigned flags)
+{
+       u64 idx = le64_to_cpu(p.v->idx) + offset;
+       s64 ret = 0;
+
+       sectors = abs(sectors);
+       BUG_ON(offset + sectors > p.k->size);
+
+       while (sectors) {
+               ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
+               if (ret < 0)
+                       break;
+
+               idx += ret;
+               sectors = max_t(s64, 0LL, sectors - ret);
+               ret = 0;
+       }
+
+       return ret;
+}
+
+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+                       unsigned offset, s64 sectors, unsigned flags)
 {
        struct replicas_delta_list *d;
        struct bch_fs *c = trans->c;
@@ -1578,11 +1671,12 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
                        ?  c->opts.btree_node_size
                        : -c->opts.btree_node_size;
 
-               return bch2_trans_mark_extent(trans, k, sectors,
-                                             BCH_DATA_BTREE);
+               return bch2_trans_mark_extent(trans, k, offset, sectors,
+                                             flags, BCH_DATA_BTREE);
        case KEY_TYPE_extent:
-               return bch2_trans_mark_extent(trans, k, sectors,
-                                             BCH_DATA_USER);
+       case KEY_TYPE_reflink_v:
+               return bch2_trans_mark_extent(trans, k, offset, sectors,
+                                             flags, BCH_DATA_USER);
        case KEY_TYPE_inode:
                d = replicas_deltas_realloc(trans, 0);
 
@@ -1604,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
                d->fs_usage.persistent_reserved[replicas - 1]   += sectors;
                return 0;
        }
+       case KEY_TYPE_reflink_p:
+               return bch2_trans_mark_reflink_p(trans,
+                                       bkey_s_c_to_reflink_p(k),
+                                       offset, sectors, flags);
        default:
                return 0;
        }
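
Marking a reflink_p thus walks the covered range of the reflink btree one indirect extent at a time: each step adjusts one reflink_v refcount (+1 on insert, -1 on overwrite, deleting the key at zero) and reports how many sectors it covered, and the caller advances idx by that amount. The walk, modeled over a flat table (toy_* types, no transaction machinery):

#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

struct toy_reflink_v { unsigned long long start, end, refcount; };

/* Adjust one indirect extent's refcount; return sectors consumed,
 * or -1 if idx points at nothing (-EIO in the real code). */
static long long toy_mark_one(struct toy_reflink_v *tree, size_t nr,
			      unsigned long long idx, bool insert)
{
	for (size_t i = 0; i < nr; i++) {
		struct toy_reflink_v *v = &tree[i];

		if (idx < v->start || idx >= v->end)
			continue;

		v->refcount += insert ? 1 : -1;
		/* refcount == 0 would delete the reflink_v key here */
		return v->end - idx;
	}
	return -1;
}

/* Walk [idx, idx + sectors), one indirect extent per step. */
static int toy_mark_range(struct toy_reflink_v *tree, size_t nr,
			  unsigned long long idx,
			  unsigned long long sectors, bool insert)
{
	while (sectors) {
		long long ret = toy_mark_one(tree, nr, idx, insert);

		if (ret < 0)
			return -1;

		idx += ret;
		sectors = (unsigned long long) ret < sectors
			? sectors - ret : 0;
	}
	return 0;
}

int main(void)
{
	struct toy_reflink_v tree[] = { { 0, 16, 1 }, { 16, 40, 2 } };

	toy_mark_range(tree, 2, 8, 20, true);	/* spans both extents */
	printf("%llu %llu\n", tree[0].refcount, tree[1].refcount); /* 2 3 */
	return 0;
}
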
@@ -1621,11 +1719,8 @@ int bch2_trans_mark_update(struct btree_trans *trans,
        if (!btree_node_type_needs_gc(iter->btree_id))
                return 0;
 
-       ret = bch2_trans_mark_key(trans,
-                       bkey_i_to_s_c(insert),
-                       bpos_min(insert->k.p, b->key.k.p).offset -
-                       bkey_start_offset(&insert->k),
-                       BCH_BUCKET_MARK_INSERT);
+       ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
+                       0, insert->k.size, BCH_BUCKET_MARK_INSERT);
        if (ret)
                return ret;
 
@@ -1633,7 +1728,9 @@ int bch2_trans_mark_update(struct btree_trans *trans,
                                                      KEY_TYPE_discard))) {
                struct bkey             unpacked;
                struct bkey_s_c         k;
+               unsigned                offset = 0;
                s64                     sectors = 0;
+               unsigned                flags = BCH_BUCKET_MARK_OVERWRITE;
 
                k = bkey_disassemble(b, _k, &unpacked);
 
@@ -1645,35 +1742,32 @@ int bch2_trans_mark_update(struct btree_trans *trans,
                if (btree_node_is_extents(b)) {
                        switch (bch2_extent_overlap(&insert->k, k.k)) {
                        case BCH_EXTENT_OVERLAP_ALL:
+                               offset = 0;
                                sectors = -((s64) k.k->size);
                                break;
                        case BCH_EXTENT_OVERLAP_BACK:
+                               offset = bkey_start_offset(&insert->k) -
+                                       bkey_start_offset(k.k);
                                sectors = bkey_start_offset(&insert->k) -
                                        k.k->p.offset;
                                break;
                        case BCH_EXTENT_OVERLAP_FRONT:
+                               offset = 0;
                                sectors = bkey_start_offset(k.k) -
                                        insert->k.p.offset;
                                break;
                        case BCH_EXTENT_OVERLAP_MIDDLE:
-                               sectors = k.k->p.offset - insert->k.p.offset;
-                               BUG_ON(sectors <= 0);
-
-                               ret = bch2_trans_mark_key(trans, k, sectors,
-                                               BCH_BUCKET_MARK_INSERT);
-                               if (ret)
-                                       return ret;
-
-                               sectors = bkey_start_offset(&insert->k) -
-                                       k.k->p.offset;
+                               offset = bkey_start_offset(&insert->k) -
+                                       bkey_start_offset(k.k);
+                               sectors = -((s64) insert->k.size);
+                               flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
                                break;
                        }
 
                        BUG_ON(sectors >= 0);
                }
 
-               ret = bch2_trans_mark_key(trans, k, sectors,
-                                         BCH_BUCKET_MARK_OVERWRITE);
+               ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
                if (ret)
                        return ret;
 
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 5ab6f3d3413718218ddf060473f05d53ce23a3c0..799bfb3c96d8a902c136c91c48a3ae45e4f8e952 100644
@@ -251,14 +251,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 
 #define BCH_BUCKET_MARK_INSERT                 (1 << 0)
 #define BCH_BUCKET_MARK_OVERWRITE              (1 << 1)
-#define BCH_BUCKET_MARK_BUCKET_INVALIDATE      (1 << 2)
-#define BCH_BUCKET_MARK_GC                     (1 << 3)
-#define BCH_BUCKET_MARK_ALLOC_READ             (1 << 4)
-#define BCH_BUCKET_MARK_NOATOMIC               (1 << 5)
+#define BCH_BUCKET_MARK_OVERWRITE_SPLIT                (1 << 2)
+#define BCH_BUCKET_MARK_BUCKET_INVALIDATE      (1 << 3)
+#define BCH_BUCKET_MARK_GC                     (1 << 4)
+#define BCH_BUCKET_MARK_ALLOC_READ             (1 << 5)
+#define BCH_BUCKET_MARK_NOATOMIC               (1 << 6)
 
-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
                         struct bch_fs_usage *, u64, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
                  struct bch_fs_usage *, u64, unsigned);
 int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                        struct disk_reservation *, unsigned);
@@ -272,7 +273,8 @@ int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
 void bch2_replicas_delta_list_apply(struct bch_fs *,
                                    struct bch_fs_usage *,
                                    struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+                       unsigned, s64, unsigned);
 int bch2_trans_mark_update(struct btree_trans *,
                           struct btree_iter *iter,
                           struct bkey_i *insert);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 848f5dcbe0ae58dae8210ff388f2200a2e6e0356..bdb18c2a0b5c9f903958b6c2b246095c7aeae189 100644
@@ -162,19 +162,20 @@ static int extent_matches_stripe(struct bch_fs *c,
                                 struct bch_stripe *v,
                                 struct bkey_s_c k)
 {
-       struct bkey_s_c_extent e;
-       const struct bch_extent_ptr *ptr;
-       int idx;
 
-       if (!bkey_extent_is_data(k.k))
-               return -1;
-
-       e = bkey_s_c_to_extent(k);
+       switch (k.k->type) {
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const struct bch_extent_ptr *ptr;
+               int idx;
 
-       extent_for_each_ptr(e, ptr) {
-               idx = ptr_matches_stripe(c, v, ptr);
-               if (idx >= 0)
-                       return idx;
+               extent_for_each_ptr(e, ptr) {
+                       idx = ptr_matches_stripe(c, v, ptr);
+                       if (idx >= 0)
+                               return idx;
+               }
+               break;
+       }
        }
 
        return -1;
@@ -182,19 +183,20 @@ static int extent_matches_stripe(struct bch_fs *c,
 
 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
 {
-       struct bkey_s_c_extent e;
-       const union bch_extent_entry *entry;
+       switch (k.k->type) {
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
 
-       if (!bkey_extent_is_data(k.k))
-               return false;
+               extent_for_each_entry(e, entry)
+                       if (extent_entry_type(entry) ==
+                           BCH_EXTENT_ENTRY_stripe_ptr &&
+                           entry->stripe_ptr.idx == idx)
+                               return true;
 
-       e = bkey_s_c_to_extent(k);
-
-       extent_for_each_entry(e, entry)
-               if (extent_entry_type(entry) ==
-                   BCH_EXTENT_ENTRY_stripe_ptr &&
-                   entry->stripe_ptr.idx == idx)
-                       return true;
+               break;
+       }
+       }
 
        return false;
 }
@@ -1310,7 +1312,7 @@ int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
                        break;
                }
 
-               bch2_mark_key(c, k, 0, NULL, 0,
+               bch2_mark_key(c, k, 0, 0, NULL, 0,
                              BCH_BUCKET_MARK_ALLOC_READ|
                              BCH_BUCKET_MARK_NOATOMIC);
        }
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index e286048b5bf83fd2e3ac29f42447e6c42c55be6c..5c6bae55d42cae18d7b90a755998211d6c4ec5eb 100644
@@ -250,6 +250,33 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
        bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
 }
 
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+
+       bkey_for_each_ptr(ptrs, ptr)
+               if (ptr->dev == dev)
+                       return ptr;
+
+       return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+
+       bkey_for_each_ptr(ptrs, ptr)
+               if (bch2_dev_in_target(c, ptr->dev, target) &&
+                   (!ptr->cached ||
+                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+                       return true;
+
+       return false;
+}
+
 /* extent specific utility code */
 
 const struct bch_extent_ptr *
@@ -280,20 +307,6 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
        return NULL;
 }
 
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
-{
-       const struct bch_extent_ptr *ptr;
-
-       extent_for_each_ptr(e, ptr)
-               if (bch2_dev_in_target(c, ptr->dev, target) &&
-                   (!ptr->cached ||
-                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
-                       return ptr;
-
-       return NULL;
-}
-
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
        unsigned ret = 0;
@@ -314,16 +327,17 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k)
        return ret;
 }
 
-bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
-                            struct bch_extent_ptr m, u64 offset)
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+                          struct bch_extent_ptr m, u64 offset)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
 
-       extent_for_each_ptr_decode(e, p, entry)
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                if (p.ptr.dev   == m.dev &&
                    p.ptr.gen   == m.gen &&
-                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) ==
+                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
                    (s64) m.offset  - offset)
                        return true;
 
@@ -390,16 +404,17 @@ static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
                bch2_csum_type_is_encryption(n.csum_type);
 }
 
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
                                 struct bch_extent_crc_unpacked n)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct bch_extent_crc_unpacked crc;
        const union bch_extent_entry *i;
 
        if (!n.csum_type)
                return false;
 
-       extent_for_each_crc(e, crc, i)
+       bkey_for_each_crc(k.k, ptrs, crc, i)
                if (can_narrow_crc(crc, n))
                        return true;
 
@@ -415,9 +430,9 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
  * currently live (so that readers won't have to bounce) while we've got the
  * checksum we need:
  */
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
-                            struct bch_extent_crc_unpacked n)
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
 {
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
        struct bch_extent_crc_unpacked u;
        struct extent_ptr_decoded p;
        union bch_extent_entry *i;
@@ -425,7 +440,7 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
 
        /* Find a checksum entry that covers only live data: */
        if (!n.csum_type) {
-               extent_for_each_crc(extent_i_to_s(e), u, i)
+               bkey_for_each_crc(&k->k, ptrs, u, i)
                        if (!u.compression_type &&
                            u.csum_type &&
                            u.live_size == u.uncompressed_size) {
@@ -437,15 +452,15 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
 found:
        BUG_ON(n.compression_type);
        BUG_ON(n.offset);
-       BUG_ON(n.live_size != e->k.size);
+       BUG_ON(n.live_size != k->k.size);
 
 restart_narrow_pointers:
-       extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
+       bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
                if (can_narrow_crc(p.crc, n)) {
-                       bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr);
+                       bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
                        p.ptr.offset += p.crc.offset;
                        p.crc = n;
-                       bch2_extent_ptr_decoded_append(e, &p);
+                       bch2_extent_ptr_decoded_append(k, &p);
                        ret = true;
                        goto restart_narrow_pointers;
                }
@@ -708,44 +723,48 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
 
 /* Extents */
 
-bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+void __bch2_cut_front(struct bpos where, struct bkey_s k)
 {
-       u64 len = 0;
+       u64 sub;
 
        if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
-               return false;
+               return;
 
        EBUG_ON(bkey_cmp(where, k.k->p) > 0);
 
-       len = k.k->p.offset - where.offset;
+       sub = where.offset - bkey_start_offset(k.k);
 
-       BUG_ON(len > k.k->size);
+       k.k->size -= sub;
 
-       /*
-        * Don't readjust offset if the key size is now 0, because that could
-        * cause offset to point to the next bucket:
-        */
-       if (!len)
+       if (!k.k->size)
                k.k->type = KEY_TYPE_deleted;
-       else if (bkey_extent_is_data(k.k)) {
-               struct bkey_s_extent e = bkey_s_to_extent(k);
+
+       switch (k.k->type) {
+       case KEY_TYPE_deleted:
+       case KEY_TYPE_discard:
+       case KEY_TYPE_error:
+       case KEY_TYPE_cookie:
+               break;
+       case KEY_TYPE_extent:
+       case KEY_TYPE_reflink_v: {
+               struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
                union bch_extent_entry *entry;
                bool seen_crc = false;
 
-               extent_for_each_entry(e, entry) {
+               bkey_extent_entry_for_each(ptrs, entry) {
                        switch (extent_entry_type(entry)) {
                        case BCH_EXTENT_ENTRY_ptr:
                                if (!seen_crc)
-                                       entry->ptr.offset += e.k->size - len;
+                                       entry->ptr.offset += sub;
                                break;
                        case BCH_EXTENT_ENTRY_crc32:
-                               entry->crc32.offset += e.k->size - len;
+                               entry->crc32.offset += sub;
                                break;
                        case BCH_EXTENT_ENTRY_crc64:
-                               entry->crc64.offset += e.k->size - len;
+                               entry->crc64.offset += sub;
                                break;
                        case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.offset += e.k->size - len;
+                               entry->crc128.offset += sub;
                                break;
                        case BCH_EXTENT_ENTRY_stripe_ptr:
                                break;
@@ -754,11 +773,20 @@ bool __bch2_cut_front(struct bpos where, struct bkey_s k)
                        if (extent_entry_is_crc(entry))
                                seen_crc = true;
                }
-       }
 
-       k.k->size = len;
+               break;
+       }
+       case KEY_TYPE_reflink_p: {
+               struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
 
-       return true;
+               le64_add_cpu(&p.v->idx, sub);
+               break;
+       }
+       case KEY_TYPE_reservation:
+               break;
+       default:
+               BUG();
+       }
 }
 
 bool bch2_cut_back(struct bpos where, struct bkey *k)
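
Why __bch2_cut_front() adds `sub` everywhere: `sub = where.offset - bkey_start_offset(k.k)` is the number of sectors dropped from the front, and since a bkey's start position is derived from its end minus its size, shrinking the size moves the start forward by `sub`. Every field that anchors data to the old start must advance by the same amount: a pointer's offset before any crc entry, or the crc offset after one. A standalone sketch of that invariant, with a toy key in place of the real bkey (hypothetical names, not bcachefs code):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* toy extent: logical [start, start + size) backed at ptr_offset */
struct toy_key {
	uint64_t start;      /* bkey_start_offset() */
	uint64_t size;       /* k.k->size           */
	uint64_t ptr_offset; /* entry->ptr.offset   */
};

/* backing sector for a logical sector within the key */
static uint64_t toy_backing(const struct toy_key *k, uint64_t logical)
{
	return k->ptr_offset + (logical - k->start);
}

/* mirrors __bch2_cut_front(): drop everything before `where` */
static void toy_cut_front(uint64_t where, struct toy_key *k)
{
	if (where <= k->start)
		return;

	uint64_t sub = where - k->start; /* sectors cut from the front */

	k->start      += sub;
	k->size       -= sub;
	k->ptr_offset += sub; /* keeps toy_backing() stable for what's left */
}

int main(void)
{
	struct toy_key k = { .start = 100, .size = 50, .ptr_offset = 1000 };
	uint64_t before = toy_backing(&k, 120);

	toy_cut_front(110, &k);
	assert(toy_backing(&k, 120) == before); /* mapping unchanged */

	printf("start=%llu size=%llu ptr_offset=%llu\n",
	       (unsigned long long) k.start,
	       (unsigned long long) k.size,
	       (unsigned long long) k.ptr_offset);
	return 0;
}
```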
@@ -772,8 +800,6 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
 
        len = where.offset - bkey_start_offset(k);
 
-       BUG_ON(len > k->size);
-
        k->p = where;
        k->size = len;
 
@@ -897,6 +923,16 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
            bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
                return;
 
+       /*
+        * We may have skipped past some deleted extents greater than the
+        * insert key before we got to a non-deleted extent and knew we could
+        * bail out; rewind the iterator a bit if necessary:
+        */
+       node_iter = l->iter;
+       while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+              bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0)
+               l->iter = node_iter;
+
        k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
 
        bch2_bset_insert(l->b, &l->iter, k, insert, 0);
@@ -921,47 +957,131 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
        return ret;
 }
 
-static inline struct bpos
-bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter)
+static int __bch2_extent_atomic_end(struct btree_trans *trans,
+                                   struct bkey_s_c k,
+                                   unsigned offset,
+                                   struct bpos *end,
+                                   unsigned *nr_iters,
+                                   unsigned max_iters)
+{
+       int ret = 0;
+
+       switch (k.k->type) {
+       case KEY_TYPE_extent:
+               *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+               if (*nr_iters >= max_iters) {
+                       *end = bpos_min(*end, k.k->p);
+                       return 0;
+               }
+
+               break;
+       case KEY_TYPE_reflink_p: {
+               struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+               u64 idx = le64_to_cpu(p.v->idx);
+               unsigned sectors = end->offset - bkey_start_offset(p.k);
+               struct btree_iter *iter;
+               struct bkey_s_c r_k;
+
+               for_each_btree_key(trans, iter,
+                                  BTREE_ID_REFLINK, POS(0, idx + offset),
+                                  BTREE_ITER_SLOTS, r_k, ret) {
+                       if (bkey_cmp(bkey_start_pos(r_k.k),
+                                    POS(0, idx + sectors)) >= 0)
+                               break;
+
+                       *nr_iters += 1;
+                       if (*nr_iters >= max_iters) {
+                               struct bpos pos = bkey_start_pos(k.k);
+                               pos.offset += r_k.k->p.offset - idx;
+
+                               *end = bpos_min(*end, pos);
+                               break;
+                       }
+               }
+
+               bch2_trans_iter_put(trans, iter);
+               break;
+       }
+       }
+
+       return ret;
+}
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+                          struct btree_iter *iter,
+                          struct bkey_i *insert,
+                          struct bpos *end)
 {
        struct btree *b = iter->l[0].b;
        struct btree_node_iter  node_iter = iter->l[0].iter;
        struct bkey_packed      *_k;
-       unsigned                nr_alloc_ptrs =
+       unsigned                nr_iters =
                bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert));
+       int ret = 0;
 
        BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
        BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
 
-       while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+       *end = bpos_min(insert->k.p, b->key.k.p);
+
+       ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert),
+                                      0, end, &nr_iters, 10);
+       if (ret)
+               return ret;
+
+       while (nr_iters < 20 &&
+              (_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
                                                      KEY_TYPE_discard))) {
                struct bkey     unpacked;
                struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+               unsigned offset = 0;
 
-               if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
+               if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
                        break;
 
-               nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k);
+               if (bkey_cmp(bkey_start_pos(&insert->k),
+                            bkey_start_pos(k.k)) > 0)
+                       offset = bkey_start_offset(&insert->k) -
+                               bkey_start_offset(k.k);
 
-               if (nr_alloc_ptrs > 20) {
-                       BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0);
-                       return bpos_min(insert->k.p, k.k->p);
-               }
+               ret = __bch2_extent_atomic_end(trans, k, offset,
+                                              end, &nr_iters, 20);
+               if (ret)
+                       return ret;
+
+               if (nr_iters >= 20)
+                       break;
 
                bch2_btree_node_iter_advance(&node_iter, b);
        }
 
-       return bpos_min(insert->k.p, b->key.k.p);
+       return 0;
 }
 
-void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
 {
-       bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k);
+       struct bpos end;
+       int ret;
+
+       ret = bch2_extent_atomic_end(iter->trans, iter, k, &end);
+       if (ret)
+               return ret;
+
+       bch2_cut_back(end, &k->k);
+       return 0;
 }
 
-bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
 {
-       return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p);
+       struct bpos end;
+       int ret;
+
+       ret = bch2_extent_atomic_end(iter->trans, iter, k, &end);
+       if (ret)
+               return ret;
+
+       return !bkey_cmp(end, k->k.p);
 }
 
 enum btree_insert_ret
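
The point of the reworked bch2_extent_atomic_end() is to bound how many btree iterators one transaction can need: the insert itself is budgeted at 10, the insert plus the keys it overwrites at 20, and a reflink_p key charges one iterator per indirect extent it points at. Once the budget is hit, *end is clamped so the remainder is handled in a later transaction. The clamping idea in isolation, as a hedged sketch (toy types; budget numbers taken from the hunk above):

```c
#include <stdint.h>
#include <stdio.h>

struct toy_key { uint64_t end; unsigned cost; };

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/*
 * Walk the keys an insert overlaps, accumulating per-key cost; once
 * the budget is reached, clamp the insert's end so everything past
 * that point is left for the next transaction.
 */
static uint64_t toy_atomic_end(uint64_t insert_end,
			       const struct toy_key *ks, unsigned nr,
			       unsigned budget)
{
	uint64_t end = insert_end;
	unsigned i, cost = 0;

	for (i = 0; i < nr; i++) {
		cost += ks[i].cost;
		if (cost >= budget) {
			end = min_u64(end, ks[i].end);
			break;
		}
	}
	return end;
}

int main(void)
{
	const struct toy_key ks[] = {
		{ .end = 10, .cost = 8  },
		{ .end = 20, .cost = 12 }, /* a budget of 20 trips here */
		{ .end = 30, .cost = 8  },
	};

	/* prints 20: the insert is trimmed at the second key's end */
	printf("atomic end: %llu\n", (unsigned long long)
	       toy_atomic_end(30, ks, 3, 20));
	return 0;
}
```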
@@ -1185,19 +1305,6 @@ next:
                    overlap == BCH_EXTENT_OVERLAP_MIDDLE)
                        break;
        }
-
-       /*
-        * may have skipped past some deleted extents greater than the insert
-        * key, before we got to a non deleted extent and knew we could bail out
-        * rewind the iterator a bit if necessary:
-        */
-       {
-               struct btree_node_iter node_iter = l->iter;
-
-               while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
-                      bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0)
-                       l->iter = node_iter;
-       }
 }
 
 /**
@@ -1394,9 +1501,12 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
 #undef set_common_fields
 }
 
-static void bch2_extent_crc_init(union bch_extent_crc *crc,
-                                struct bch_extent_crc_unpacked new)
+static void bch2_extent_crc_append(struct bkey_i *k,
+                                  struct bch_extent_crc_unpacked new)
 {
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+       union bch_extent_crc *crc = (void *) ptrs.end;
+
        if (bch_crc_bytes[new.csum_type]        <= 4 &&
            new.uncompressed_size - 1           <= CRC32_SIZE_MAX &&
            new.nonce                           <= CRC32_NONCE_MAX)
@@ -1413,54 +1523,53 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc,
                BUG();
 
        bch2_extent_crc_pack(crc, new);
-}
 
-void bch2_extent_crc_append(struct bkey_i_extent *e,
-                           struct bch_extent_crc_unpacked new)
-{
-       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
-       __extent_entry_push(e);
+       k->k.u64s += extent_entry_u64s(ptrs.end);
+
+       EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
 }
 
-static inline void __extent_entry_insert(struct bkey_i_extent *e,
+static inline void __extent_entry_insert(struct bkey_i *k,
                                         union bch_extent_entry *dst,
                                         union bch_extent_entry *new)
 {
-       union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e));
+       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
 
        memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
                        dst, (u64 *) end - (u64 *) dst);
-       e->k.u64s += extent_entry_u64s(new);
+       k->k.u64s += extent_entry_u64s(new);
        memcpy(dst, new, extent_entry_bytes(new));
 }
 
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
                                    struct extent_ptr_decoded *p)
 {
-       struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL);
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+       struct bch_extent_crc_unpacked crc =
+               bch2_extent_crc_unpack(&k->k, NULL);
        union bch_extent_entry *pos;
        unsigned i;
 
        if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
-               pos = e->v.start;
+               pos = ptrs.start;
                goto found;
        }
 
-       extent_for_each_crc(extent_i_to_s(e), crc, pos)
+       bkey_for_each_crc(&k->k, ptrs, crc, pos)
                if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
                        pos = extent_entry_next(pos);
                        goto found;
                }
 
-       bch2_extent_crc_append(e, p->crc);
-       pos = extent_entry_last(extent_i_to_s(e));
+       bch2_extent_crc_append(k, p->crc);
+       pos = bkey_val_end(bkey_i_to_s(k));
 found:
        p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-       __extent_entry_insert(e, pos, to_entry(&p->ptr));
+       __extent_entry_insert(k, pos, to_entry(&p->ptr));
 
        for (i = 0; i < p->ec_nr; i++) {
                p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
-               __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+               __extent_entry_insert(k, pos, to_entry(&p->ec[i]));
        }
 }
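
__extent_entry_insert() above opens a gap for the new entry by sliding the tail of the value up with memmove_u64s_up() and then copying the entry in. The same open-a-gap pattern on a plain u64 array, as a minimal sketch (plain memmove standing in for the kernel helper):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Insert `entry` at position `pos` in val[0..*nr), shifting the tail
 * up by one; the caller guarantees capacity for one more element.
 */
static void toy_entry_insert(uint64_t *val, unsigned *nr,
			     unsigned pos, uint64_t entry)
{
	/* overlapping ranges, so memmove (like memmove_u64s_up()) */
	memmove(&val[pos + 1], &val[pos], (*nr - pos) * sizeof(*val));
	val[pos] = entry;
	(*nr)++;
}

int main(void)
{
	uint64_t val[8] = { 10, 20, 40 };
	unsigned i, nr = 3;

	toy_entry_insert(val, &nr, 2, 30);

	for (i = 0; i < nr; i++)
		printf("%" PRIu64 " ", val[i]); /* 10 20 30 40 */
	printf("\n");
	return 0;
}
```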
 
@@ -1487,17 +1596,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
        return false;
 }
 
-void bch2_extent_mark_replicas_cached(struct bch_fs *c,
-                                     struct bkey_s_extent e,
-                                     unsigned target,
-                                     unsigned nr_desired_replicas)
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+                                   unsigned target,
+                                   unsigned nr_desired_replicas)
 {
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
        union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
+       int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
 
        if (target && extra > 0)
-               extent_for_each_ptr_decode(e, p, entry) {
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int n = bch2_extent_ptr_durability(c, p);
 
                        if (n && n <= extra &&
@@ -1508,7 +1617,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                }
 
        if (extra > 0)
-               extent_for_each_ptr_decode(e, p, entry) {
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int n = bch2_extent_ptr_durability(c, p);
 
                        if (n && n <= extra) {
index fe92737354bd202815ea70fb627b383824911520..6fddbaceb355634f07421da3496dffd61b9ad9d5 100644 (file)
@@ -12,7 +12,8 @@ struct btree_insert_entry;
 
 /* extent entries: */
 
-#define extent_entry_last(_e)          bkey_val_end(_e)
+#define extent_entry_last(_e)                                          \
+       ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
 
 #define entry_to_ptr(_entry)                                           \
 ({                                                                     \
@@ -258,6 +259,27 @@ out:                                                                       \
        __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,            \
                                   _ptr, _entry)
 
+#define bkey_crc_next(_k, _start, _end, _crc, _iter)                   \
+({                                                                     \
+       __bkey_extent_entry_for_each_from(_iter, _end, _iter)           \
+               if (extent_entry_is_crc(_iter)) {                       \
+                       (_crc) = bch2_extent_crc_unpack(_k,             \
+                                               entry_to_crc(_iter));   \
+                       break;                                          \
+               }                                                       \
+                                                                       \
+       (_iter) < (_end);                                               \
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter)             \
+       for ((_crc) = bch2_extent_crc_unpack(_k, NULL),                 \
+            (_iter) = (_start);                                        \
+            bkey_crc_next(_k, _start, _end, _crc, _iter);              \
+            (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter)                         \
+       __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
 /* utility code common to all keys with pointers: */
 
 static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
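
bkey_crc_next()/bkey_for_each_crc() above use the kernel's GNU C statement-expression idiom: the "next" macro advances the cursor to the next matching entry and evaluates to whether the cursor is still in range, so it can sit directly in a for-loop condition. The same idiom on a plain int array (GNU C only, so gcc/clang; illustrative, not the bcachefs macros):

```c
#include <stdio.h>

/* advance _iter to the next even entry; true while still in range */
#define toy_even_next(_iter, _end)			\
({							\
	while ((_iter) < (_end) && *(_iter) % 2)	\
		(_iter)++;				\
	(_iter) < (_end);				\
})

#define toy_for_each_even(_start, _end, _iter)		\
	for ((_iter) = (_start);			\
	     toy_even_next(_iter, _end);		\
	     (_iter)++)

int main(void)
{
	int v[] = { 1, 2, 3, 4, 6, 7 };
	int *i;

	toy_for_each_even(v, v + 6, i)
		printf("%d ", *i); /* 2 4 6 */
	printf("\n");
	return 0;
}
```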
@@ -267,7 +289,7 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
                struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
                return (struct bkey_ptrs_c) {
                        to_entry(&e.v->start[0]),
-                       to_entry(bkey_val_end(e))
+                       to_entry(extent_entry_last(e))
                };
        }
        case KEY_TYPE_extent: {
@@ -284,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
                        to_entry(&s.v->ptrs[s.v->nr_blocks]),
                };
        }
+       case KEY_TYPE_reflink_v: {
+               struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+               return (struct bkey_ptrs_c) {
+                       r.v->start,
+                       bkey_val_end(r),
+               };
+       }
        default:
                return (struct bkey_ptrs_c) { NULL, NULL };
        }
@@ -337,18 +367,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
        return ret;
 }
 
-static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
-{
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-
-       bkey_for_each_ptr(p, ptr)
-               if (ptr->dev == dev)
-                       return ptr;
-
-       return NULL;
-}
-
 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
@@ -359,6 +377,11 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
                               struct bch_io_failures *,
                               struct extent_ptr_decoded *);
 
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -410,8 +433,10 @@ enum merge_result bch2_reservation_merge(struct bch_fs *,
        .key_merge      = bch2_reservation_merge,               \
 }
 
-void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+                          struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
 
 enum btree_insert_ret
 bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
@@ -419,52 +444,46 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
 void bch2_insert_fixup_extent(struct btree_trans *,
                              struct btree_insert_entry *);
 
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
-                                     unsigned, unsigned);
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+                                   unsigned, unsigned);
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
-bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
-                            struct bch_extent_ptr, u64);
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+                          struct bch_extent_ptr, u64);
 
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
        switch (k->type) {
        case KEY_TYPE_btree_ptr:
        case KEY_TYPE_extent:
+       case KEY_TYPE_reflink_p:
+       case KEY_TYPE_reflink_v:
                return true;
        default:
                return false;
        }
 }
 
+/*
+ * Should this extent be counted under inode->i_sectors?
+ */
 static inline bool bkey_extent_is_allocation(const struct bkey *k)
 {
        switch (k->type) {
        case KEY_TYPE_extent:
        case KEY_TYPE_reservation:
+       case KEY_TYPE_reflink_p:
+       case KEY_TYPE_reflink_v:
                return true;
        default:
                return false;
        }
 }
 
-static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
-{
-       return bkey_extent_is_allocation(k.k) &&
-               !bch2_extent_is_compressed(k);
-}
-
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
 /* Extent entry iteration: */
 
 #define extent_for_each_entry_from(_e, _entry, _start)                 \
@@ -480,45 +499,16 @@ void bch2_bkey_drop_device(struct bkey_s, unsigned);
 #define extent_for_each_ptr(_e, _ptr)                                  \
        __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
 
-#define extent_crc_next(_e, _crc, _iter)                               \
-({                                                                     \
-       extent_for_each_entry_from(_e, _iter, _iter)                    \
-               if (extent_entry_is_crc(_iter)) {                       \
-                       (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
-                       break;                                          \
-               }                                                       \
-                                                                       \
-       (_iter) < extent_entry_last(_e);                                \
-})
-
-#define extent_for_each_crc(_e, _crc, _iter)                           \
-       for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL),             \
-            (_iter) = (_e).v->start;                                   \
-            extent_crc_next(_e, _crc, _iter);                          \
-            (_iter) = extent_entry_next(_iter))
-
 #define extent_for_each_ptr_decode(_e, _ptr, _entry)                   \
        __bkey_for_each_ptr_decode((_e).k, (_e).v->start,               \
                                   extent_entry_last(_e), _ptr, _entry)
 
-void bch2_extent_crc_append(struct bkey_i_extent *,
-                           struct bch_extent_crc_unpacked);
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *,
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
 
-static inline void __extent_entry_push(struct bkey_i_extent *e)
-{
-       union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
-
-       EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
-               BKEY_EXTENT_VAL_U64s_MAX);
-
-       e->k.u64s += extent_entry_u64s(entry);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
                                 struct bch_extent_crc_unpacked);
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
 
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
@@ -540,11 +530,11 @@ do {                                                                      \
        }                                                               \
 } while (0)
 
-bool __bch2_cut_front(struct bpos, struct bkey_s);
+void __bch2_cut_front(struct bpos, struct bkey_s);
 
-static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k)
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
 {
-       return __bch2_cut_front(where, bkey_i_to_s(k));
+       __bch2_cut_front(where, bkey_i_to_s(k));
 }
 
 bool bch2_cut_back(struct bpos, struct bkey *);
index 5d0c2b696c1ec8ecbd3617fc003880c73f4fc390..d8113b292eb0ff67b380bfb2c0d40bda11ab0ee7 100644 (file)
@@ -16,6 +16,7 @@
 #include "io.h"
 #include "keylist.h"
 #include "quota.h"
+#include "reflink.h"
 
 #include <linux/aio.h>
 #include <linux/backing-dev.h>
@@ -193,9 +194,9 @@ static int inode_set_size(struct bch_inode_info *inode,
        return 0;
 }
 
-static int __must_check bch2_write_inode_size(struct bch_fs *c,
-                                             struct bch_inode_info *inode,
-                                             loff_t new_size, unsigned fields)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      loff_t new_size, unsigned fields)
 {
        struct inode_new_size s = {
                .new_size       = new_size,
@@ -277,16 +278,16 @@ static int sum_sector_overwrites(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_extent_update(struct btree_trans *trans,
-                             struct bch_inode_info *inode,
-                             struct disk_reservation *disk_res,
-                             struct quota_res *quota_res,
-                             struct btree_iter *extent_iter,
-                             struct bkey_i *k,
-                             u64 new_i_size,
-                             bool may_allocate,
-                             bool direct,
-                             s64 *total_delta)
+int bch2_extent_update(struct btree_trans *trans,
+                      struct bch_inode_info *inode,
+                      struct disk_reservation *disk_res,
+                      struct quota_res *quota_res,
+                      struct btree_iter *extent_iter,
+                      struct bkey_i *k,
+                      u64 new_i_size,
+                      bool may_allocate,
+                      bool direct,
+                      s64 *total_delta)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *inode_iter = NULL;
@@ -298,13 +299,13 @@ static int bch2_extent_update(struct btree_trans *trans,
        s64 i_sectors_delta;
        int ret;
 
-       bch2_trans_begin_updates(trans);
-
        ret = bch2_btree_iter_traverse(extent_iter);
        if (ret)
                return ret;
 
-       bch2_extent_trim_atomic(k, extent_iter);
+       ret = bch2_extent_trim_atomic(k, extent_iter);
+       if (ret)
+               return ret;
 
        ret = sum_sector_overwrites(trans, extent_iter,
                                    k, &allocating,
@@ -448,6 +449,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
 
                bkey_copy(&tmp.k, bch2_keylist_front(keys));
 
+               bch2_trans_begin_updates(&trans);
+
                ret = bch2_extent_update(&trans, inode,
                                &wop->res, quota_res,
                                iter, &tmp.k,
@@ -511,13 +514,14 @@ struct bch_page_sector {
        /* i_sectors: */
        enum {
                SECTOR_UNALLOCATED,
-               SECTOR_QUOTA_RESERVED,
+               SECTOR_RESERVED,
                SECTOR_DIRTY,
                SECTOR_ALLOCATED,
        }                       state:2;
 };
 
 struct bch_page_state {
+       atomic_t                write_count;
        struct bch_page_sector  s[PAGE_SECTORS];
 };
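
The sector states now form an ordered progression, UNALLOCATED < RESERVED < DIRTY < ALLOCATED, and bch2_set_page_dirty() further down moves a sector to max(state, SECTOR_DIRTY), so dirtying never demotes a sector that is already reserved or allocated. A self-contained sketch of that monotonic transition (toy enum mirroring the states above):

```c
#include <stdio.h>

/* ordered as in struct bch_page_sector */
enum toy_state {
	TOY_UNALLOCATED,
	TOY_RESERVED,
	TOY_DIRTY,
	TOY_ALLOCATED,
};

static const char *toy_name(enum toy_state s)
{
	static const char *names[] = {
		"unallocated", "reserved", "dirty", "allocated",
	};
	return names[s];
}

/* mirrors the max_t() in bch2_set_page_dirty(): never demote a sector */
static enum toy_state toy_set_dirty(enum toy_state s)
{
	return s > TOY_DIRTY ? s : TOY_DIRTY;
}

int main(void)
{
	enum toy_state s;

	for (s = TOY_UNALLOCATED; s <= TOY_ALLOCATED; s++)
		printf("%-11s -> %s\n", toy_name(s),
		       toy_name(toy_set_dirty(s)));
	return 0;
}
```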
 
@@ -588,31 +592,6 @@ static struct bch_page_state *bch2_page_state_create(struct page *page,
        return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
 }
 
-static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
-                                     struct page *page)
-{
-       struct bch_page_state *s = bch2_page_state(page);
-       struct disk_reservation disk_res = { 0 };
-       struct quota_res quota_res = { 0 };
-       unsigned i;
-
-       if (!s)
-               return;
-
-       for (i = 0; i < ARRAY_SIZE(s->s); i++) {
-               disk_res.sectors += s->s[i].replicas_reserved;
-               s->s[i].replicas_reserved = 0;
-
-               if (s->s[i].state == SECTOR_QUOTA_RESERVED) {
-                       quota_res.sectors++;
-                       s->s[i].state = SECTOR_UNALLOCATED;
-               }
-       }
-
-       bch2_quota_reservation_put(c, inode, &quota_res);
-       bch2_disk_reservation_put(c, &disk_res);
-}
-
 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
 {
        /* XXX: this should not be open coded */
@@ -663,98 +642,134 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c,
        return 0;
 }
 
-static int bch2_get_page_quota_reservation(struct bch_fs *c,
+struct bch2_page_reservation {
+       struct disk_reservation disk;
+       struct quota_res        quota;
+};
+
+static void bch2_page_reservation_init(struct bch_fs *c,
                        struct bch_inode_info *inode,
-                       struct page *page, bool check_enospc)
+                       struct bch2_page_reservation *res)
+{
+       memset(res, 0, sizeof(*res));
+
+       res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+static void bch2_page_reservation_put(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct bch2_page_reservation *res)
+{
+       bch2_disk_reservation_put(c, &res->disk);
+       bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+static int bch2_page_reservation_get(struct bch_fs *c,
+                       struct bch_inode_info *inode, struct page *page,
+                       struct bch2_page_reservation *res,
+                       unsigned offset, unsigned len, bool check_enospc)
 {
        struct bch_page_state *s = bch2_page_state_create(page, 0);
-       struct quota_res quota_res = { 0 };
-       unsigned i, quota_res_sectors = 0;
+       unsigned i, disk_sectors = 0, quota_sectors = 0;
        int ret;
 
        if (!s)
                return -ENOMEM;
 
-       for (i = 0; i < ARRAY_SIZE(s->s); i++)
-               quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED;
-
-       if (!quota_res_sectors)
-               return 0;
+       for (i = offset / 512;
+            i < DIV_ROUND_UP(offset + len, 512);
+            i++) {
+               disk_sectors += sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
+               quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
+       }
 
-       ret = bch2_quota_reservation_add(c, inode, &quota_res,
-                                        quota_res_sectors,
-                                        check_enospc);
-       if (unlikely(ret))
-               return ret;
+       if (disk_sectors) {
+               ret = bch2_disk_reservation_add(c, &res->disk,
+                                               disk_sectors,
+                                               !check_enospc
+                                               ? BCH_DISK_RESERVATION_NOFAIL
+                                               : 0);
+               if (unlikely(ret))
+                       return ret;
+       }
 
-       for (i = 0; i < ARRAY_SIZE(s->s); i++)
-               if (s->s[i].state == SECTOR_UNALLOCATED)
-                       s->s[i].state = SECTOR_QUOTA_RESERVED;
+       if (quota_sectors) {
+               ret = bch2_quota_reservation_add(c, inode, &res->quota,
+                                                quota_sectors,
+                                                check_enospc);
+               if (unlikely(ret)) {
+                       struct disk_reservation tmp = {
+                               .sectors = disk_sectors
+                       };
+
+                       bch2_disk_reservation_put(c, &tmp);
+                       res->disk.sectors -= disk_sectors;
+                       return ret;
+               }
+       }
 
        return 0;
 }
 
-static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
-                                    struct page *page, bool check_enospc)
-{
-       return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?:
-               bch2_get_page_quota_reservation(c, inode, page, check_enospc);
-}
-
 static void bch2_clear_page_bits(struct page *page)
 {
        struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_page_state *s = bch2_page_state(page);
+       struct disk_reservation disk_res = { 0 };
        int i, dirty_sectors = 0;
 
        if (!s)
                return;
 
        for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+               disk_res.sectors += s->s[i].replicas_reserved;
+               s->s[i].replicas_reserved = 0;
+
                if (s->s[i].state == SECTOR_DIRTY) {
                        dirty_sectors++;
                        s->s[i].state = SECTOR_UNALLOCATED;
                }
        }
 
+       bch2_disk_reservation_put(c, &disk_res);
+
        if (dirty_sectors)
                i_sectors_acct(c, inode, NULL, -dirty_sectors);
-       bch2_put_page_reservation(c, inode, page);
 
        bch2_page_state_release(page);
 }
 
-static void __bch2_set_page_dirty(struct page *page)
+static void bch2_set_page_dirty(struct bch_fs *c,
+                       struct bch_inode_info *inode, struct page *page,
+                       struct bch2_page_reservation *res,
+                       unsigned offset, unsigned len)
 {
-       struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_page_state *s = bch2_page_state(page);
-       struct quota_res quota_res = { 0 };
        unsigned i, dirty_sectors = 0;
 
-       BUG_ON(!s);
+       for (i = offset / 512;
+            i < DIV_ROUND_UP(offset + len, 512);
+            i++) {
+               unsigned sectors = sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
 
-       for (i = 0; i < ARRAY_SIZE(s->s); i++) {
-               if (s->s[i].state == SECTOR_QUOTA_RESERVED)
-                       quota_res.sectors++;
+               BUG_ON(sectors > res->disk.sectors);
+               s->s[i].replicas_reserved += sectors;
+               res->disk.sectors -= sectors;
 
-               if (s->s[i].state == SECTOR_UNALLOCATED ||
-                   s->s[i].state == SECTOR_QUOTA_RESERVED) {
-                       s->s[i].state = SECTOR_DIRTY;
+               if (s->s[i].state == SECTOR_UNALLOCATED)
                        dirty_sectors++;
-               }
+
+               s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
        }
 
        if (dirty_sectors)
-               i_sectors_acct(c, inode, &quota_res, dirty_sectors);
-       bch2_quota_reservation_put(c, inode, &quota_res);
-}
+               i_sectors_acct(c, inode, &res->quota, dirty_sectors);
 
-static void bch2_set_page_dirty(struct page *page)
-{
-       __bch2_set_page_dirty(page);
-       __set_page_dirty_nobuffers(page);
+       if (!PageDirty(page))
+               __set_page_dirty_nobuffers(page);
 }
 
 vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
@@ -764,8 +779,11 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
        struct bch_inode_info *inode = file_bch_inode(file);
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_page_reservation res;
        int ret = VM_FAULT_LOCKED;
 
+       bch2_page_reservation_init(c, inode, &res);
+
        sb_start_pagefault(inode->v.i_sb);
        file_update_time(file);
 
@@ -786,19 +804,22 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
                goto out;
        }
 
-       if (bch2_get_page_reservation(c, inode, page, true)) {
+       if (bch2_page_reservation_get(c, inode, page, &res,
+                                     0, PAGE_SIZE, true)) {
                unlock_page(page);
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
 
-       if (!PageDirty(page))
-               bch2_set_page_dirty(page);
+       bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE);
        wait_for_stable_page(page);
 out:
        if (current->pagecache_lock != &mapping->add_lock)
                pagecache_add_put(&mapping->add_lock);
        sb_end_pagefault(inode->v.i_sb);
+
+       bch2_page_reservation_put(c, inode, &res);
+
        return ret;
 }
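
Note the ordering inside bch2_page_reservation_get() earlier in this file: the disk reservation is taken first and the quota reservation second, and if the quota step fails, exactly the disk sectors just taken are handed back through a temporary struct. mkwrite() above then shows the full lifecycle: init, get, set_page_dirty, put. The take-both-or-neither shape as a hedged user-space sketch (counters standing in for the real reservation machinery):

```c
#include <stdbool.h>
#include <stdio.h>

/* toy pools standing in for free disk space and quota */
static long disk_free  = 100;
static long quota_free = 4;

static bool toy_disk_get(long sectors)
{
	if (disk_free < sectors)
		return false;
	disk_free -= sectors;
	return true;
}

static void toy_disk_put(long sectors) { disk_free += sectors; }

static bool toy_quota_get(long sectors)
{
	if (quota_free < sectors)
		return false;
	quota_free -= sectors;
	return true;
}

/* mirrors bch2_page_reservation_get(): disk, then quota, or neither */
static int toy_reservation_get(long disk_sectors, long quota_sectors)
{
	if (disk_sectors && !toy_disk_get(disk_sectors))
		return -1;

	if (quota_sectors && !toy_quota_get(quota_sectors)) {
		toy_disk_put(disk_sectors); /* roll back the first step */
		return -1;
	}
	return 0;
}

int main(void)
{
	printf("get(8, 2): %d\n", toy_reservation_get(8, 2)); /* 0: both taken */
	printf("get(8, 9): %d\n", toy_reservation_get(8, 9)); /* -1: rolled back */
	printf("disk_free=%ld quota_free=%ld\n", disk_free, quota_free);
	return 0;
}
```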
 
@@ -857,31 +878,6 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
 }
 #endif
 
-/* readpages/writepages: */
-
-static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
-{
-       sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
-       return bio->bi_vcnt < bio->bi_max_vecs &&
-               bio_end_sector(bio) == offset;
-}
-
-static int bio_add_page_contig(struct bio *bio, struct page *page)
-{
-       sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
-       EBUG_ON(!bio->bi_max_vecs);
-
-       if (!bio->bi_vcnt)
-               bio->bi_iter.bi_sector = offset;
-       else if (!bio_can_add_page_contig(bio, page))
-               return -1;
-
-       BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
-       return 0;
-}
-
 /* readpage(s): */
 
 static void bch2_readpages_end_io(struct bio *bio)
@@ -991,11 +987,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
-       unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
-
-       BUG_ON(bio->bi_iter.bi_sector   < bkey_start_offset(k.k));
-       BUG_ON(bio_end_sector(bio)      > k.k->p.offset);
-
+       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+               ? 0 : bch2_bkey_nr_ptrs_allocated(k);
+       unsigned state = k.k->type == KEY_TYPE_reservation
+               ? SECTOR_RESERVED
+               : SECTOR_ALLOCATED;
 
        bio_for_each_segment(bv, bio, iter) {
                struct bch_page_state *s = bch2_page_state(bv.bv_page);
@@ -1005,16 +1001,17 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
                     i < (bv.bv_offset + bv.bv_len) >> 9;
                     i++) {
                        s->s[i].nr_replicas = nr_ptrs;
-                       s->s[i].state = SECTOR_ALLOCATED;
+                       s->s[i].state = state;
                }
        }
 }
 
 static void readpage_bio_extend(struct readpages_iter *iter,
-                               struct bio *bio, u64 offset,
+                               struct bio *bio,
+                               unsigned sectors_this_extent,
                                bool get_more)
 {
-       while (bio_end_sector(bio) < offset &&
+       while (bio_sectors(bio) < sectors_this_extent &&
               bio->bi_vcnt < bio->bi_max_vecs) {
                pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
                struct page *page = readpage_iter_next(iter);
@@ -1062,71 +1059,82 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
-       struct bio *bio = &rbio->bio;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
+       int ret = 0;
 
        rbio->c = c;
        rbio->start_time = local_clock();
-
+retry:
        while (1) {
                BKEY_PADDED(k) tmp;
                struct bkey_s_c k;
-               unsigned bytes;
+               unsigned bytes, sectors, offset_into_extent;
 
-               bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
+               bch2_btree_iter_set_pos(iter,
+                               POS(inum, rbio->bio.bi_iter.bi_sector));
 
                k = bch2_btree_iter_peek_slot(iter);
-               BUG_ON(!k.k);
-
-               if (IS_ERR(k.k)) {
-                       int ret = btree_iter_err(iter);
-                       BUG_ON(!ret);
-                       bcache_io_error(c, bio, "btree IO error %i", ret);
-                       bio_endio(bio);
-                       return;
-               }
+               ret = bkey_err(k);
+               if (ret)
+                       break;
 
                bkey_reassemble(&tmp.k, k);
-               bch2_trans_unlock(trans);
                k = bkey_i_to_s_c(&tmp.k);
 
+               offset_into_extent = iter->pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               ret = bch2_read_indirect_extent(trans, iter,
+                                       &offset_into_extent, &tmp.k);
+               if (ret)
+                       break;
+
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               bch2_trans_unlock(trans);
+
                if (readpages_iter) {
                        bool want_full_extent = false;
 
                        if (bkey_extent_is_data(k.k)) {
-                               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+                               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                                const union bch_extent_entry *i;
                                struct extent_ptr_decoded p;
 
-                               extent_for_each_ptr_decode(e, p, i)
+                               bkey_for_each_ptr_decode(k.k, ptrs, p, i)
                                        want_full_extent |= ((p.crc.csum_type != 0) |
                                                             (p.crc.compression_type != 0));
                        }
 
-                       readpage_bio_extend(readpages_iter,
-                                           bio, k.k->p.offset,
-                                           want_full_extent);
+                       readpage_bio_extend(readpages_iter, &rbio->bio,
+                                           sectors, want_full_extent);
                }
 
-               bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
-                        bio->bi_iter.bi_sector) << 9;
-               swap(bio->bi_iter.bi_size, bytes);
+               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+               swap(rbio->bio.bi_iter.bi_size, bytes);
 
-               if (bytes == bio->bi_iter.bi_size)
+               if (rbio->bio.bi_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
                if (bkey_extent_is_allocation(k.k))
-                       bch2_add_page_sectors(bio, k);
+                       bch2_add_page_sectors(&rbio->bio, k);
 
-               bch2_read_extent(c, rbio, k, flags);
+               bch2_read_extent(c, rbio, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
                        return;
 
-               swap(bio->bi_iter.bi_size, bytes);
-               bio_advance(bio, bytes);
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+               bio_advance(&rbio->bio, bytes);
        }
+
+       if (ret == -EINTR)
+               goto retry;
+
+       bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+       bio_endio(&rbio->bio);
 }
 
 int bch2_readpages(struct file *file, struct address_space *mapping,
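
The reflink indirection in this read path is purely positional: a reflink_p key stores an idx into the REFLINK btree, offset o into the extent maps to POS(0, idx + o) (as in __bch2_extent_atomic_end() earlier), and __bch2_cut_front() keeps that mapping stable by bumping idx when the front of the key is cut. A toy model of the arithmetic (illustrative structs, not the on-disk format):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* toy reflink_p: logical [start, start + size) -> REFLINK btree at idx */
struct toy_reflink_p { uint64_t start, size, idx; };

/* POS(0, idx + offset_into_extent), per __bch2_extent_atomic_end() */
static uint64_t toy_resolve(const struct toy_reflink_p *p, uint64_t logical)
{
	assert(logical >= p->start && logical < p->start + p->size);
	return p->idx + (logical - p->start);
}

/* __bch2_cut_front() bumps idx by the cut size, keeping resolve() stable */
static void toy_cut_front(struct toy_reflink_p *p, uint64_t sub)
{
	p->start += sub;
	p->size  -= sub;
	p->idx   += sub; /* the le64_add_cpu(&p.v->idx, sub) in the diff */
}

int main(void)
{
	struct toy_reflink_p p = { .start = 200, .size = 64, .idx = 5000 };
	uint64_t before = toy_resolve(&p, 210);

	toy_cut_front(&p, 8);
	assert(toy_resolve(&p, 210) == before); /* still reflink pos 5010 */

	printf("logical 210 -> reflink pos %llu\n",
	       (unsigned long long) toy_resolve(&p, 210));
	return 0;
}
```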
@@ -1191,7 +1199,9 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
        bch2_page_state_create(page, __GFP_NOFAIL);
 
        bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
-       bio_add_page_contig(&rbio->bio, page);
+       rbio->bio.bi_iter.bi_sector =
+               (sector_t) page->index << PAGE_SECTOR_SHIFT;
+       BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
        bch2_trans_init(&trans, c, 0, 0);
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
@@ -1277,12 +1287,20 @@ static void bch2_writepage_io_done(struct closure *cl)
        struct bio *bio = &io->op.op.wbio.bio;
        struct bvec_iter_all iter;
        struct bio_vec *bvec;
-       unsigned i;
+       unsigned i, j;
 
        if (io->op.op.error) {
                bio_for_each_segment_all(bvec, bio, i, iter) {
+                       struct bch_page_state *s;
+
                        SetPageError(bvec->bv_page);
                        mapping_set_error(bvec->bv_page->mapping, -EIO);
+
+                       lock_page(bvec->bv_page);
+                       s = bch2_page_state(bvec->bv_page);
+                       for (j = 0; j < PAGE_SECTORS; j++)
+                               s->s[j].nr_replicas = 0;
+                       unlock_page(bvec->bv_page);
                }
        }
 
@@ -1307,8 +1325,12 @@ static void bch2_writepage_io_done(struct closure *cl)
                i_sectors_acct(c, io->op.inode, NULL,
                               io->op.sectors_added - (s64) io->new_sectors);
 
-       bio_for_each_segment_all(bvec, bio, i, iter)
-               end_page_writeback(bvec->bv_page);
+       bio_for_each_segment_all(bvec, bio, i, iter) {
+               struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
+
+               if (atomic_dec_and_test(&s->write_count))
+                       end_page_writeback(bvec->bv_page);
+       }
 
        closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
 }
@@ -1329,11 +1351,10 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
 static void bch2_writepage_io_alloc(struct bch_fs *c,
                                    struct bch_writepage_state *w,
                                    struct bch_inode_info *inode,
-                                   struct page *page,
+                                   u64 sector,
                                    unsigned nr_replicas)
 {
        struct bch_write_op *op;
-       u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
 
        w->io = container_of(bio_alloc_bioset(GFP_NOFS,
                                              BIO_MAX_PAGES,
@@ -1347,8 +1368,8 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
        op->nr_replicas         = nr_replicas;
        op->res.nr_replicas     = nr_replicas;
        op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
-       op->pos                 = POS(inode->v.i_ino, offset);
-       op->wbio.bio.bi_iter.bi_sector = offset;
+       op->pos                 = POS(inode->v.i_ino, sector);
+       op->wbio.bio.bi_iter.bi_sector = sector;
 }
 
 static int __bch2_writepage(struct page *page,
@@ -1358,12 +1379,10 @@ static int __bch2_writepage(struct page *page,
        struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_writepage_state *w = data;
-       struct bch_page_state *s;
-       unsigned offset, nr_replicas_this_write = U32_MAX;
-       unsigned dirty_sectors = 0, reserved_sectors = 0;
+       struct bch_page_state *s, orig;
+       unsigned i, offset, nr_replicas_this_write = U32_MAX;
        loff_t i_size = i_size_read(&inode->v);
        pgoff_t end_index = i_size >> PAGE_SHIFT;
-       unsigned i;
        int ret;
 
        EBUG_ON(!PageUptodate(page));
@@ -1398,48 +1417,90 @@ do_io:
                return 0;
        }
 
-       for (i = 0; i < PAGE_SECTORS; i++)
+       /* Before unlocking the page, get a copy of the reservations: */
+       orig = *s;
+
+       for (i = 0; i < PAGE_SECTORS; i++) {
+               if (s->s[i].state < SECTOR_DIRTY)
+                       continue;
+
                nr_replicas_this_write =
                        min_t(unsigned, nr_replicas_this_write,
                              s->s[i].nr_replicas +
                              s->s[i].replicas_reserved);
-
-       /* Before unlocking the page, transfer reservation to w->io: */
+       }
 
        for (i = 0; i < PAGE_SECTORS; i++) {
+               if (s->s[i].state < SECTOR_DIRTY)
+                       continue;
+
                s->s[i].nr_replicas = w->opts.compression
                        ? 0 : nr_replicas_this_write;
 
-               reserved_sectors += s->s[i].replicas_reserved;
                s->s[i].replicas_reserved = 0;
-
-               dirty_sectors += s->s[i].state == SECTOR_DIRTY;
                s->s[i].state = SECTOR_ALLOCATED;
        }
 
+       BUG_ON(atomic_read(&s->write_count));
+       atomic_set(&s->write_count, 1);
+
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
+
        unlock_page(page);
 
-       if (w->io &&
-           (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
-            !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
-               bch2_writepage_do_io(w);
+       offset = 0;
+       while (1) {
+               unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+               u64 sector;
+
+               while (offset < PAGE_SECTORS &&
+                      orig.s[offset].state < SECTOR_DIRTY)
+                       offset++;
+
+               if (offset == PAGE_SECTORS)
+                       break;
+
+               sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
+
+               while (offset + sectors < PAGE_SECTORS &&
+                      orig.s[offset + sectors].state >= SECTOR_DIRTY)
+                       sectors++;
+
+               for (i = offset; i < offset + sectors; i++) {
+                       reserved_sectors += orig.s[i].replicas_reserved;
+                       dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
+               }
 
-       if (!w->io)
-               bch2_writepage_io_alloc(c, w, inode, page,
-                                       nr_replicas_this_write);
+               if (w->io &&
+                   (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
+                    bio_full(&w->io->op.op.wbio.bio) ||
+                    bio_end_sector(&w->io->op.op.wbio.bio) != sector))
+                       bch2_writepage_do_io(w);
 
-       w->io->new_sectors += dirty_sectors;
+               if (!w->io)
+                       bch2_writepage_io_alloc(c, w, inode, sector,
+                                               nr_replicas_this_write);
 
-       BUG_ON(inode != w->io->op.inode);
-       BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+               w->io->new_sectors += dirty_sectors;
 
-       w->io->op.op.res.sectors += reserved_sectors;
-       w->io->op.new_i_size = i_size;
+               atomic_inc(&s->write_count);
 
-       if (wbc->sync_mode == WB_SYNC_ALL)
-               w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+               BUG_ON(inode != w->io->op.inode);
+               BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page,
+                                    sectors << 9, offset << 9));
+
+               w->io->op.op.res.sectors += reserved_sectors;
+               w->io->op.new_i_size = i_size;
+
+               if (wbc->sync_mode == WB_SYNC_ALL)
+                       w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
+               offset += sectors;
+       }
+
+       if (atomic_dec_and_test(&s->write_count))
+               end_page_writeback(page);
 
        return 0;
 }
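
Because a page's dirty sectors can now be split across several bios, end_page_writeback() must fire only after every bio has completed and after __bch2_writepage() itself has finished submitting. The diff handles that with write_count: set to 1 up front (the submitter's reference), incremented once per bio, and dropped on each completion plus once at the end of submission. The same pattern with C11 atomics (toy code, single-threaded here just to show the counting):

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_int write_count;

static void toy_end_writeback(void) { printf("end_page_writeback()\n"); }

/* one completion: drop a reference; the last one out ends writeback */
static void toy_put(void)
{
	if (atomic_fetch_sub(&write_count, 1) == 1)
		toy_end_writeback();
}

int main(void)
{
	int bio;

	/* the submitter holds a reference while it queues bios */
	atomic_store(&write_count, 1);

	for (bio = 0; bio < 2; bio++)
		atomic_fetch_add(&write_count, 1); /* one ref per bio */

	toy_put(); /* submission finished: drop the submitter's ref */
	toy_put(); /* first bio completes */
	toy_put(); /* second bio completes: writeback ends here */
	return 0;
}
```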
@@ -1482,12 +1543,18 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
 {
        struct bch_inode_info *inode = to_bch_ei(mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_page_reservation *res;
        pgoff_t index = pos >> PAGE_SHIFT;
        unsigned offset = pos & (PAGE_SIZE - 1);
        struct page *page;
        int ret = -ENOMEM;
 
-       BUG_ON(inode_unhashed(&inode->v));
+       res = kmalloc(sizeof(*res), GFP_KERNEL);
+       if (!res)
+               return -ENOMEM;
+
+       bch2_page_reservation_init(c, inode, res);
+       *fsdata = res;
 
        /* Not strictly necessary - same reason as mkwrite(): */
        pagecache_add_get(&mapping->add_lock);
@@ -1519,7 +1586,8 @@ readpage:
        if (ret)
                goto err;
 out:
-       ret = bch2_get_page_reservation(c, inode, page, true);
+       ret = bch2_page_reservation_get(c, inode, page, res,
+                                       offset, len, true);
        if (ret) {
                if (!PageUptodate(page)) {
                        /*
@@ -1542,6 +1610,8 @@ err:
        *pagep = NULL;
 err_unlock:
        pagecache_add_put(&mapping->add_lock);
+       kfree(res);
+       *fsdata = NULL;
        return ret;
 }
 
@@ -1551,6 +1621,8 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
 {
        struct bch_inode_info *inode = to_bch_ei(mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_page_reservation *res = fsdata;
+       unsigned offset = pos & (PAGE_SIZE - 1);
 
        lockdep_assert_held(&inode->v.i_rwsem);
 
@@ -1573,18 +1645,19 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
        if (copied) {
                if (!PageUptodate(page))
                        SetPageUptodate(page);
-               if (!PageDirty(page))
-                       bch2_set_page_dirty(page);
+
+               bch2_set_page_dirty(c, inode, page, res, offset, copied);
 
                inode->ei_last_dirtied = (unsigned long) current;
-       } else {
-               bch2_put_page_reservation(c, inode, page);
        }
 
        unlock_page(page);
        put_page(page);
        pagecache_add_put(&mapping->add_lock);
 
+       bch2_page_reservation_put(c, inode, res);
+       kfree(res);
+
        return copied;
 }
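
bch2_write_begin() now allocates a struct bch2_page_reservation and
parks it in *fsdata; bch2_write_end() gets the same pointer back,
releases whatever part of the reservation went unused, and frees it. A
minimal sketch of that fsdata handoff pattern, with a hypothetical
stand-in type (not the real bch2_page_reservation):

    #include <stdlib.h>

    struct demo_reservation { unsigned sectors; };

    static int demo_write_begin(void **fsdata)
    {
            struct demo_reservation *res = malloc(sizeof(*res));

            if (!res)
                    return -1;      /* -ENOMEM in the kernel */
            res->sectors = 0;
            *fsdata = res;          /* travels to demo_write_end() untouched */
            return 0;
    }

    static void demo_write_end(void *fsdata)
    {
            struct demo_reservation *res = fsdata;

            /* release anything still reserved, then free the cookie */
            free(res);
    }

    int main(void)
    {
            void *cookie;

            if (!demo_write_begin(&cookie))
                    demo_write_end(cookie);
            return 0;
    }
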
 
@@ -1597,15 +1670,19 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct page *pages[WRITE_BATCH_PAGES];
+       struct bch2_page_reservation res;
        unsigned long index = pos >> PAGE_SHIFT;
        unsigned offset = pos & (PAGE_SIZE - 1);
        unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
-       unsigned i, copied = 0, nr_pages_copied = 0;
+       unsigned i, reserved = 0, set_dirty = 0;
+       unsigned copied = 0, nr_pages_copied = 0;
        int ret = 0;
 
        BUG_ON(!len);
        BUG_ON(nr_pages > ARRAY_SIZE(pages));
 
+       bch2_page_reservation_init(c, inode, &res);
+
        for (i = 0; i < nr_pages; i++) {
                pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
                if (!pages[i]) {
@@ -1632,19 +1709,25 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                }
        }
 
-       for (i = 0; i < nr_pages; i++) {
-               ret = bch2_get_page_reservation(c, inode, pages[i], true);
-
-               if (ret && !PageUptodate(pages[i])) {
-                       ret = bch2_read_single_page(pages[i], mapping);
-                       if (ret)
-                               goto out;
-
-                       ret = bch2_get_page_reservation(c, inode, pages[i], true);
+       while (reserved < len) {
+               struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+               unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
+               unsigned pg_len = min_t(unsigned, len - reserved,
+                                       PAGE_SIZE - pg_offset);
+retry_reservation:
+               ret = bch2_page_reservation_get(c, inode, page, &res,
+                                               pg_offset, pg_len, true);
+
+               if (ret && !PageUptodate(page)) {
+                       ret = bch2_read_single_page(page, mapping);
+                       if (!ret)
+                               goto retry_reservation;
                }
 
                if (ret)
                        goto out;
+
+               reserved += pg_len;
        }
 
        if (mapping_writably_mapped(mapping))
@@ -1654,10 +1737,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        while (copied < len) {
                struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
                unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
-               unsigned pg_bytes = min_t(unsigned, len - copied,
-                                         PAGE_SIZE - pg_offset);
+               unsigned pg_len = min_t(unsigned, len - copied,
+                                       PAGE_SIZE - pg_offset);
                unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
-                                               iter, pg_offset, pg_bytes);
+                                               iter, pg_offset, pg_len);
 
                if (!pg_copied)
                        break;
@@ -1687,23 +1770,30 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
                        copied -= (offset + copied) & (PAGE_SIZE - 1);
                }
        }
-out:
-       for (i = 0; i < nr_pages_copied; i++) {
-               if (!PageUptodate(pages[i]))
-                       SetPageUptodate(pages[i]);
-               if (!PageDirty(pages[i]))
-                       bch2_set_page_dirty(pages[i]);
-               unlock_page(pages[i]);
-               put_page(pages[i]);
-       }
 
+       while (set_dirty < copied) {
+               struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
+               unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
+               unsigned pg_len = min_t(unsigned, copied - set_dirty,
+                                       PAGE_SIZE - pg_offset);
+
+               if (!PageUptodate(page))
+                       SetPageUptodate(page);
+
+               bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
+               unlock_page(page);
+               put_page(page);
+
+               set_dirty += pg_len;
+       }
+out:
        for (i = nr_pages_copied; i < nr_pages; i++) {
-               if (!PageDirty(pages[i]))
-                       bch2_put_page_reservation(c, inode, pages[i]);
                unlock_page(pages[i]);
                put_page(pages[i]);
        }
 
+       bch2_page_reservation_put(c, inode, &res);
+
        return copied ?: ret;
 }
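
__bch2_buffered_write() now makes three passes over the same byte
range: reserve space, copy from the iov, then mark the copied bytes
dirty. Each pass derives the page index, intra-page offset, and chunk
length from a single running counter. A standalone sketch of that
shared arithmetic, with illustrative values:

    #include <stdio.h>

    #define PAGE_SIZE  4096u
    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned offset = 3000, len = 6000, done = 0;

            while (done < len) {
                    unsigned pg_index  = (offset + done) >> PAGE_SHIFT;
                    unsigned pg_offset = (offset + done) & (PAGE_SIZE - 1);
                    unsigned pg_len    = len - done < PAGE_SIZE - pg_offset
                                       ? len - done : PAGE_SIZE - pg_offset;

                    printf("page %u: offset %u, len %u\n",
                           pg_index, pg_offset, pg_len);
                    done += pg_len;
            }
            return 0;
    }
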
 
@@ -2186,29 +2276,25 @@ out:
 
 /* truncate: */
 
-static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
-                        u64 start_offset, u64 end_offset, u64 *journal_seq)
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+                  struct bpos end, struct bch_inode_info *inode,
+                  u64 new_i_size)
 {
-       struct bpos start       = POS(inode->v.i_ino, start_offset);
-       struct bpos end         = POS(inode->v.i_ino, end_offset);
+       struct bch_fs *c        = trans->c;
        unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
-       struct btree_trans trans;
-       struct btree_iter *iter;
        struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
-                                  BTREE_ITER_INTENT);
+       int ret = 0, ret2 = 0;
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k)) &&
               bkey_cmp(iter->pos, end) < 0) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
 
+               ret = bkey_err(k);
+               if (ret)
+                       goto btree_err;
+
                bkey_init(&delete.k);
                delete.k.p = iter->pos;
 
@@ -2216,21 +2302,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
                bch2_key_resize(&delete.k, max_sectors);
                bch2_cut_back(end, &delete.k);
 
-               ret = bch2_extent_update(&trans, inode,
+               bch2_trans_begin_updates(trans);
+
+               ret = bch2_extent_update(trans, inode,
                                &disk_res, NULL, iter, &delete,
-                               0, true, true, NULL);
+                               new_i_size, false, true, NULL);
                bch2_disk_reservation_put(c, &disk_res);
-
-               if (ret == -EINTR)
+btree_err:
+               if (ret == -EINTR) {
+                       ret2 = ret;
                        ret = 0;
+               }
                if (ret)
                        break;
+       }
 
-               bch2_trans_cond_resched(&trans);
+       if (bkey_cmp(iter->pos, end) > 0) {
+               bch2_btree_iter_set_pos(iter, end);
+               ret = bch2_btree_iter_traverse(iter);
        }
 
+       return ret ?: ret2;
+}
+
+static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
+                        u64 start_offset, u64 end_offset)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                                  POS(inode->v.i_ino, start_offset),
+                                  BTREE_ITER_INTENT);
+
+       ret = bch2_fpunch_at(&trans, iter,
+                            POS(inode->v.i_ino, end_offset),
+                            inode, 0);
+
        bch2_trans_exit(&trans);
 
+       if (ret == -EINTR)
+               ret = 0;
+
        return ret;
 }
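
Note the error convention in bch2_fpunch_at(): a transaction restart
(-EINTR) is remembered in ret2 but does not stop the loop, and the
final "ret ?: ret2" lets the caller decide whether to retry;
__bch2_fpunch() simply squashes a trailing -EINTR to success. A
standalone sketch of that convention, with do_one_delete() a made-up
stand-in for one transactional extent deletion:

    #include <errno.h>
    #include <stdio.h>

    static int do_one_delete(int i)
    {
            return i == 1 ? -EINTR : 0;     /* pretend step 1 restarted */
    }

    int main(void)
    {
            int ret = 0, ret2 = 0, i;

            for (i = 0; i < 3; i++) {
                    ret = do_one_delete(i);
                    if (ret == -EINTR) {    /* remember, keep punching */
                            ret2 = ret;
                            ret = 0;
                    }
                    if (ret)
                            break;
            }
            ret = ret ?: ret2;              /* GNU ?:, as in the kernel */
            printf("ret = %d\n", ret);      /* -EINTR: caller may retry */
            return 0;
    }
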
 
@@ -2263,8 +2379,10 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
+       struct bch_page_state *s;
        unsigned start_offset = start & (PAGE_SIZE - 1);
        unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+       unsigned i;
        struct page *page;
        int ret = 0;
 
@@ -2296,31 +2414,42 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
                }
        }
 
+       s = bch2_page_state_create(page, 0);
+       if (!s) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
        if (!PageUptodate(page)) {
                ret = bch2_read_single_page(page, mapping);
                if (ret)
                        goto unlock;
        }
 
+       if (index != start >> PAGE_SHIFT)
+               start_offset = 0;
+       if (index != end >> PAGE_SHIFT)
+               end_offset = PAGE_SIZE;
+
+       for (i = round_up(start_offset, block_bytes(c)) >> 9;
+            i < round_down(end_offset, block_bytes(c)) >> 9;
+            i++) {
+               s->s[i].nr_replicas     = 0;
+               s->s[i].state           = SECTOR_UNALLOCATED;
+       }
+
+       zero_user_segment(page, start_offset, end_offset);
+
        /*
         * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
         *
         * XXX: because we aren't currently tracking whether the page has actual
         * data in it (vs. just 0s, or only partially written) this is wrong. ick.
         */
-       ret = bch2_get_page_reservation(c, inode, page, false);
+       ret = bch2_get_page_disk_reservation(c, inode, page, false);
        BUG_ON(ret);
 
-       if (index == start >> PAGE_SHIFT &&
-           index == end >> PAGE_SHIFT)
-               zero_user_segment(page, start_offset, end_offset);
-       else if (index == start >> PAGE_SHIFT)
-               zero_user_segment(page, start_offset, PAGE_SIZE);
-       else if (index == end >> PAGE_SHIFT)
-               zero_user_segment(page, 0, end_offset);
-
-       if (!PageDirty(page))
-               bch2_set_page_dirty(page);
+       __set_page_dirty_nobuffers(page);
 unlock:
        unlock_page(page);
        put_page(page);
@@ -2331,7 +2460,7 @@ out:
 static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
 {
        return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
-                                   from, from + PAGE_SIZE);
+                                   from, round_up(from, PAGE_SIZE));
 }
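
The switch from "from + PAGE_SIZE" to "round_up(from, PAGE_SIZE)"
matters exactly when "from" is already page aligned: the old end bound
pointed a full page past the truncation point instead of at it. A quick
arithmetic demonstration (this round_up is a simplified division-based
stand-in for the kernel macro, which requires a power-of-two alignment):

    #include <stdio.h>

    #define PAGE_SIZE 4096ull
    #define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main(void)
    {
            unsigned long long from = 8192; /* page aligned */

            printf("from + PAGE_SIZE:          %llu\n", from + PAGE_SIZE);
            printf("round_up(from, PAGE_SIZE): %llu\n",
                   round_up(from, PAGE_SIZE)); /* 8192, not 12288 */
            return 0;
    }
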
 
 static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
@@ -2422,13 +2551,9 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 
        truncate_setsize(&inode->v, iattr->ia_size);
 
-       /*
-        * XXX: need a comment explaining why PAGE_SIZE and not block_bytes()
-        * here:
-        */
        ret = __bch2_fpunch(c, inode,
-                       round_up(iattr->ia_size, PAGE_SIZE) >> 9,
-                       U64_MAX, &inode->ei_journal_seq);
+                       round_up(iattr->ia_size, block_bytes(c)) >> 9,
+                       U64_MAX);
        if (unlikely(ret))
                goto err;
 
@@ -2449,8 +2574,8 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
-       u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+       u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
+       u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
        int ret = 0;
 
        inode_lock(&inode->v);
@@ -2475,8 +2600,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
        truncate_pagecache_range(&inode->v, offset, offset + len - 1);
 
        if (discard_start < discard_end)
-               ret = __bch2_fpunch(c, inode, discard_start, discard_end,
-                                   &inode->ei_journal_seq);
+               ret = __bch2_fpunch(c, inode, discard_start, discard_end);
 err:
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
@@ -2535,7 +2659,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 
        while (bkey_cmp(dst->pos,
                        POS(inode->v.i_ino,
-                           round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
+                           round_up(new_size, block_bytes(c)) >> 9)) < 0) {
                struct disk_reservation disk_res;
 
                ret = bch2_btree_iter_traverse(dst);
@@ -2554,7 +2678,9 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                bch2_cut_front(src->pos, &copy.k);
                copy.k.k.p.offset -= len >> 9;
 
-               bch2_extent_trim_atomic(&copy.k, dst);
+               ret = bch2_extent_trim_atomic(&copy.k, dst);
+               if (ret)
+                       goto bkey_err;
 
                BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
 
@@ -2563,6 +2689,8 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                                BCH_DISK_RESERVATION_NOFAIL);
                BUG_ON(ret);
 
+               bch2_trans_begin_updates(&trans);
+
                ret = bch2_extent_update(&trans, inode,
                                &disk_res, NULL,
                                dst, &copy.k,
@@ -2584,7 +2712,7 @@ bkey_err:
 
        ret = __bch2_fpunch(c, inode,
                        round_up(new_size, block_bytes(c)) >> 9,
-                       U64_MAX, &inode->ei_journal_seq);
+                       U64_MAX);
        if (ret)
                goto err;
 
@@ -2608,8 +2736,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bpos end_pos;
-       loff_t block_start, block_end;
-       loff_t end = offset + len;
+       loff_t end              = offset + len;
+       loff_t block_start      = round_down(offset,    block_bytes(c));
+       loff_t block_end        = round_up(end,         block_bytes(c));
        unsigned sectors;
        unsigned replicas = io_opts(c, inode).data_replicas;
        int ret;
@@ -2641,12 +2770,6 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                        goto err;
 
                truncate_pagecache_range(&inode->v, offset, end - 1);
-
-               block_start     = round_up(offset, PAGE_SIZE);
-               block_end       = round_down(end, PAGE_SIZE);
-       } else {
-               block_start     = round_down(offset, PAGE_SIZE);
-               block_end       = round_up(end, PAGE_SIZE);
        }
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -2706,6 +2829,8 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                        reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
+               bch2_trans_begin_updates(&trans);
+
                ret = bch2_extent_update(&trans, inode,
                                &disk_res, &quota_res,
                                iter, &reservation.k_i,
@@ -2770,42 +2895,148 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        return -EOPNOTSUPP;
 }
 
+static void mark_range_unallocated(struct bch_inode_info *inode,
+                                  loff_t start, loff_t end)
+{
+       pgoff_t index = start >> PAGE_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
+       struct pagevec pvec;
+
+       pagevec_init(&pvec);
+
+       do {
+               unsigned nr_pages, i, j;
+
+               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+                                               &index, end_index);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       struct bch_page_state *s;
+
+                       lock_page(page);
+                       s = bch2_page_state(page);
+
+                       if (s)
+                               for (j = 0; j < PAGE_SECTORS; j++)
+                                       s->s[j].nr_replicas = 0;
+
+                       unlock_page(page);
+               }
+               pagevec_release(&pvec);
+       } while (index <= end_index);
+}
+
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+                            struct file *file_dst, loff_t pos_dst,
+                            loff_t len, unsigned remap_flags)
+{
+       struct bch_inode_info *src = file_bch_inode(file_src);
+       struct bch_inode_info *dst = file_bch_inode(file_dst);
+       struct bch_fs *c = src->v.i_sb->s_fs_info;
+       loff_t ret = 0;
+       loff_t aligned_len;
+
+       if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+               return -EINVAL;
+
+       if (remap_flags & REMAP_FILE_DEDUP)
+               return -EOPNOTSUPP;
+
+       if ((pos_src & (block_bytes(c) - 1)) ||
+           (pos_dst & (block_bytes(c) - 1)))
+               return -EINVAL;
+
+       if (src == dst &&
+           abs(pos_src - pos_dst) < len)
+               return -EINVAL;
+
+       bch2_lock_inodes(INODE_LOCK, src, dst);
+
+       inode_dio_wait(&src->v);
+       inode_dio_wait(&dst->v);
+
+       __pagecache_block_get(&src->v.i_mapping->add_lock);
+       __pagecache_block_get(&dst->v.i_mapping->add_lock);
+
+       ret = generic_remap_file_range_prep(file_src, pos_src,
+                                           file_dst, pos_dst,
+                                           &len, remap_flags);
+       if (ret < 0 || len == 0)
+               goto out_unlock;
+
+       aligned_len = round_up(len, block_bytes(c));
+
+       ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+                               pos_dst, pos_dst + aligned_len);
+       if (ret)
+               goto out_unlock;
+
+       mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+
+       ret = bch2_remap_range(c, dst,
+                              POS(dst->v.i_ino, pos_dst >> 9),
+                              POS(src->v.i_ino, pos_src >> 9),
+                              aligned_len >> 9,
+                              pos_dst + len);
+       if (ret > 0)
+               ret = min(ret << 9, len);
+
+out_unlock:
+       __pagecache_block_put(&dst->v.i_mapping->add_lock);
+       __pagecache_block_put(&src->v.i_mapping->add_lock);
+
+       bch2_unlock_inodes(INODE_LOCK, src, dst);
+
+       return ret;
+}
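
bch2_remap_file_range() rejects positions that are not aligned to the
filesystem block size, and rejects source and destination ranges that
overlap within a single file. A toy illustration of those two checks
(block_bytes is an assumed constant here):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            long long block_bytes = 4096;
            long long pos_src = 4096, pos_dst = 8192, len = 8192;
            int same_file = 1;

            if ((pos_src & (block_bytes - 1)) ||
                (pos_dst & (block_bytes - 1)))
                    printf("-EINVAL: unaligned\n");
            else if (same_file && llabs(pos_src - pos_dst) < len)
                    /* bytes 8192..12287 are both source and destination */
                    printf("-EINVAL: ranges overlap\n");
            else
                    printf("ok\n");
            return 0;
    }
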
+
 /* fseek: */
 
-static bool page_is_data(struct page *page)
+static int page_data_offset(struct page *page, unsigned offset)
 {
        struct bch_page_state *s = bch2_page_state(page);
        unsigned i;
 
-       if (!s)
-               return false;
-
-       for (i = 0; i < PAGE_SECTORS; i++)
-               if (s->s[i].state >= SECTOR_DIRTY)
-                       return true;
+       if (s)
+               for (i = offset >> 9; i < PAGE_SECTORS; i++)
+                       if (s->s[i].state >= SECTOR_DIRTY)
+                               return i << 9;
 
-       return false;
+       return -1;
 }
 
-static loff_t bch2_next_pagecache_data(struct inode *vinode,
+static loff_t bch2_seek_pagecache_data(struct inode *vinode,
                                       loff_t start_offset,
                                       loff_t end_offset)
 {
        struct address_space *mapping = vinode->i_mapping;
        struct page *page;
-       pgoff_t index;
-
-       for (index = start_offset >> PAGE_SHIFT;
-            index < end_offset >> PAGE_SHIFT;
-            index++) {
-               if (find_get_pages(mapping, &index, 1, &page)) {
+       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
+       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
+       pgoff_t index           = start_index;
+       loff_t ret;
+       int offset;
+
+       while (index <= end_index) {
+               if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
                        lock_page(page);
 
-                       if (page_is_data(page))
-                               end_offset =
-                                       min(end_offset,
-                                       max(start_offset,
-                                           ((loff_t) index) << PAGE_SHIFT));
+                       offset = page_data_offset(page,
+                                       page->index == start_index
+                                       ? start_offset & (PAGE_SIZE - 1)
+                                       : 0);
+                       if (offset >= 0) {
+                               ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+                                           offset,
+                                           start_offset, end_offset);
+                               unlock_page(page);
+                               put_page(page);
+                               return ret;
+                       }
+
                        unlock_page(page);
                        put_page(page);
                } else {
@@ -2848,43 +3079,65 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                return ret;
 
        if (next_data > offset)
-               next_data = bch2_next_pagecache_data(&inode->v,
+               next_data = bch2_seek_pagecache_data(&inode->v,
                                                     offset, next_data);
 
-       if (next_data > isize)
+       if (next_data >= isize)
                return -ENXIO;
 
        return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
 }
 
-static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+static int __page_hole_offset(struct page *page, unsigned offset)
 {
+       struct bch_page_state *s = bch2_page_state(page);
+       unsigned i;
+
+       if (!s)
+               return 0;
+
+       for (i = offset >> 9; i < PAGE_SECTORS; i++)
+               if (s->s[i].state < SECTOR_DIRTY)
+                       return i << 9;
+
+       return -1;
+}
+
+static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
+{
+       pgoff_t index = offset >> PAGE_SHIFT;
        struct page *page;
-       bool ret;
+       int pg_offset;
+       loff_t ret = -1;
 
        page = find_lock_entry(mapping, index);
        if (!page || xa_is_value(page))
-               return false;
+               return offset;
+
+       pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
+       if (pg_offset >= 0)
+               ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
 
-       ret = page_is_data(page);
        unlock_page(page);
 
        return ret;
 }
 
-static loff_t bch2_next_pagecache_hole(struct inode *vinode,
+static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
                                       loff_t start_offset,
                                       loff_t end_offset)
 {
        struct address_space *mapping = vinode->i_mapping;
-       pgoff_t index;
+       loff_t offset = start_offset, hole;
 
-       for (index = start_offset >> PAGE_SHIFT;
-            index < end_offset >> PAGE_SHIFT;
-            index++)
-               if (!page_slot_is_data(mapping, index))
-                       end_offset = max(start_offset,
-                                        ((loff_t) index) << PAGE_SHIFT);
+       while (offset < end_offset) {
+               hole = page_hole_offset(mapping, offset);
+               if (hole >= 0 && hole <= end_offset)
+                       return max(start_offset, hole);
+
+               offset += PAGE_SIZE;
+               offset &= PAGE_MASK;
+       }
 
        return end_offset;
 }
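
Seek is now sector granular rather than page granular: each cached
page's bch_page_state is scanned starting from the requested sector,
treating state >= SECTOR_DIRTY as data and anything below as a hole,
and a hit is converted back to a byte offset with i << 9. A standalone
sketch of the scan (the state array and enum here are stand-ins for
struct bch_page_state):

    #include <stdio.h>

    #define PAGE_SECTORS 8
    enum { SECTOR_UNALLOCATED, SECTOR_RESERVED,
           SECTOR_DIRTY, SECTOR_ALLOCATED };

    static int first_hole_offset(const int *state, unsigned offset)
    {
            unsigned i;

            for (i = offset >> 9; i < PAGE_SECTORS; i++)
                    if (state[i] < SECTOR_DIRTY)
                            return i << 9;  /* byte offset in the page */
            return -1;                      /* page is all data */
    }

    int main(void)
    {
            int state[PAGE_SECTORS] = {
                    SECTOR_DIRTY, SECTOR_DIRTY, SECTOR_UNALLOCATED,
                    SECTOR_DIRTY, SECTOR_DIRTY, SECTOR_DIRTY,
                    SECTOR_DIRTY, SECTOR_DIRTY,
            };

            printf("first hole at byte %d\n",
                   first_hole_offset(state, 0));       /* 1024 */
            return 0;
    }
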
@@ -2909,11 +3162,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                           POS(inode->v.i_ino, offset >> 9),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
-                       next_hole = bch2_next_pagecache_hole(&inode->v,
+                       next_hole = bch2_seek_pagecache_hole(&inode->v,
                                        offset, MAX_LFS_FILESIZE);
                        break;
                } else if (!bkey_extent_is_data(k.k)) {
-                       next_hole = bch2_next_pagecache_hole(&inode->v,
+                       next_hole = bch2_seek_pagecache_hole(&inode->v,
                                        max(offset, bkey_start_offset(k.k) << 9),
                                        k.k->p.offset << 9);
 
index 2b3ac496dc3e04a396ca50dd92852178a8783789..a35732327e9178f62054db2ff937896af0df3e54 100644 (file)
@@ -9,6 +9,22 @@
 
 #include <linux/uio.h>
 
+struct quota_res;
+
+int bch2_extent_update(struct btree_trans *,
+                      struct bch_inode_info *,
+                      struct disk_reservation *,
+                      struct quota_res *,
+                      struct btree_iter *,
+                      struct bkey_i *,
+                      u64, bool, bool, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+                  struct bpos, struct bch_inode_info *, u64);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+                                      struct bch_inode_info *,
+                                      loff_t, unsigned);
+
 int bch2_writepage(struct page *, struct writeback_control *);
 int bch2_readpage(struct file *, struct page *);
 
@@ -30,6 +46,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
 int bch2_truncate(struct bch_inode_info *, struct iattr *);
 long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
 
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+                            loff_t, loff_t, unsigned);
+
 loff_t bch2_llseek(struct file *, loff_t, int);
 
 vm_fault_t bch2_page_mkwrite(struct vm_fault *);
index b1d23e3f7a31a6e7a879a249939d84d417ad912f..a35f34eb293c7b3dd02692f9c5c6d49869553d2f 100644 (file)
@@ -1068,16 +1068,20 @@ static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
        return 0;
 }
 
-static int bch2_fill_extent(struct fiemap_extent_info *info,
-                           const struct bkey_i *k, unsigned flags)
+static int bch2_fill_extent(struct bch_fs *c,
+                           struct fiemap_extent_info *info,
+                           struct bkey_s_c k, unsigned flags)
 {
-       if (bkey_extent_is_data(&k->k)) {
-               struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+       if (bkey_extent_is_data(k.k)) {
+               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                int ret;
 
-               extent_for_each_ptr_decode(e, p, entry) {
+               if (k.k->type == KEY_TYPE_reflink_v)
+                       flags |= FIEMAP_EXTENT_SHARED;
+
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        int flags2 = 0;
                        u64 offset = p.ptr.offset;
 
@@ -1086,23 +1090,23 @@ static int bch2_fill_extent(struct fiemap_extent_info *info,
                        else
                                offset += p.crc.offset;
 
-                       if ((offset & (PAGE_SECTORS - 1)) ||
-                           (e.k->size & (PAGE_SECTORS - 1)))
+                       if ((offset & (c->opts.block_size - 1)) ||
+                           (k.k->size & (c->opts.block_size - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 
                        ret = fiemap_fill_next_extent(info,
-                                               bkey_start_offset(e.k) << 9,
+                                               bkey_start_offset(k.k) << 9,
                                                offset << 9,
-                                               e.k->size << 9, flags|flags2);
+                                               k.k->size << 9, flags|flags2);
                        if (ret)
                                return ret;
                }
 
                return 0;
-       } else if (k->k.type == KEY_TYPE_reservation) {
+       } else if (k.k->type == KEY_TYPE_reservation) {
                return fiemap_fill_next_extent(info,
-                                              bkey_start_offset(&k->k) << 9,
-                                              0, k->k.size << 9,
+                                              bkey_start_offset(k.k) << 9,
+                                              0, k.k->size << 9,
                                               flags|
                                               FIEMAP_EXTENT_DELALLOC|
                                               FIEMAP_EXTENT_UNWRITTEN);
@@ -1119,7 +1123,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       BKEY_PADDED(k) tmp;
+       BKEY_PADDED(k) cur, prev;
+       unsigned offset_into_extent, sectors;
        bool have_extent = false;
        int ret = 0;
 
@@ -1128,27 +1133,58 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
-                          POS(ei->v.i_ino, start >> 9), 0, k, ret)
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                                  POS(ei->v.i_ino, start >> 9),
+                                  BTREE_ITER_SLOTS);
+
+       while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) {
+               k = bch2_btree_iter_peek_slot(iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               bkey_reassemble(&cur.k, k);
+               k = bkey_i_to_s_c(&cur.k);
+
+               offset_into_extent      = iter->pos.offset -
+                       bkey_start_offset(k.k);
+               sectors                 = k.k->size - offset_into_extent;
+
+               ret = bch2_read_indirect_extent(&trans, iter,
+                                       &offset_into_extent, &cur.k);
+               if (ret)
+                       break;
+
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               bch2_cut_front(POS(k.k->p.inode,
+                                  bkey_start_offset(k.k) + offset_into_extent),
+                              &cur.k);
+               bch2_key_resize(&cur.k.k, sectors);
+               cur.k.k.p.offset = iter->pos.offset + cur.k.k.size;
+
                if (bkey_extent_is_data(k.k) ||
                    k.k->type == KEY_TYPE_reservation) {
-                       if (bkey_cmp(bkey_start_pos(k.k),
-                                    POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
-                               break;
-
                        if (have_extent) {
-                               ret = bch2_fill_extent(info, &tmp.k, 0);
+                               ret = bch2_fill_extent(c, info,
+                                               bkey_i_to_s_c(&prev.k), 0);
                                if (ret)
                                        break;
                        }
 
-                       bkey_reassemble(&tmp.k, k);
+                       bkey_copy(&prev.k, &cur.k);
                        have_extent = true;
                }
 
-       if (!ret && have_extent)
-               ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+               bch2_btree_iter_set_pos(iter,
+                               POS(iter->pos.inode,
+                                   iter->pos.offset + sectors));
+       }
 
+       if (!ret && have_extent)
+               ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+                                      FIEMAP_EXTENT_LAST);
+err:
        ret = bch2_trans_exit(&trans) ?: ret;
        return ret < 0 ? ret : 0;
 }
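
The fiemap loop holds each extent back one iteration (prev vs. cur) so
that the final fiemap_fill_next_extent() call can carry
FIEMAP_EXTENT_LAST. A minimal sketch of that one-behind pattern:

    #include <stdio.h>

    static void fill_extent(int ext, int last)
    {
            printf("extent %d%s\n", ext, last ? " (LAST)" : "");
    }

    int main(void)
    {
            int extents[] = { 1, 2, 3 }, prev = 0, have_prev = 0;
            unsigned i;

            for (i = 0; i < 3; i++) {
                    if (have_prev)          /* emit the previous extent */
                            fill_extent(prev, 0);
                    prev = extents[i];
                    have_prev = 1;
            }
            if (have_prev)                  /* leftover gets LAST */
                    fill_extent(prev, 1);
            return 0;
    }
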
@@ -1196,6 +1232,7 @@ static const struct file_operations bch_file_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = bch2_compat_fs_ioctl,
 #endif
+       .remap_file_range = bch2_remap_file_range,
 };
 
 static const struct inode_operations bch_file_inode_operations = {
@@ -1712,9 +1749,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
                goto out;
        }
 
-       /* XXX: blocksize */
-       sb->s_blocksize         = PAGE_SIZE;
-       sb->s_blocksize_bits    = PAGE_SHIFT;
+       sb->s_blocksize         = block_bytes(c);
+       sb->s_blocksize_bits    = ilog2(block_bytes(c));
        sb->s_maxbytes          = MAX_LFS_FILESIZE;
        sb->s_op                = &bch_super_operations;
        sb->s_export_op         = &bch_export_ops;
index 4d81b6e6e54f79a5510d36941d50ca112810acdd..c5d9a0c53064912c3fad05c016fc43ca194ad5c9 100644 (file)
@@ -259,6 +259,8 @@ int bch2_write_index_default(struct bch_write_op *op)
        bch2_verify_keylist_sorted(keys);
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+retry:
+       bch2_trans_begin(&trans);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
                                   bkey_start_pos(&bch2_keylist_front(keys)->k),
@@ -269,7 +271,9 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                bkey_copy(&split.k, bch2_keylist_front(keys));
 
-               bch2_extent_trim_atomic(&split.k, iter);
+               ret = bch2_extent_trim_atomic(&split.k, iter);
+               if (ret)
+                       break;
 
                bch2_trans_update(&trans,
                                  BTREE_INSERT_ENTRY(iter, &split.k));
@@ -286,6 +290,11 @@ int bch2_write_index_default(struct bch_write_op *op)
                        bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
+       if (ret == -EINTR) {
+               ret = 0;
+               goto retry;
+       }
+
        bch2_trans_exit(&trans);
 
        return ret;
@@ -426,7 +435,7 @@ static void init_append_extent(struct bch_write_op *op,
                p.ptr.cached = !ca->mi.durability ||
                        (op->flags & BCH_WRITE_CACHED) != 0;
                p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
-               bch2_extent_ptr_decoded_append(e, &p);
+               bch2_extent_ptr_decoded_append(&e->k_i, &p);
 
                BUG_ON(crc.compressed_size > ob->sectors_free);
                ob->sectors_free -= crc.compressed_size;
@@ -954,17 +963,13 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
                                  struct bch_io_opts opts,
                                  unsigned flags)
 {
-       if (!bkey_extent_is_data(k.k))
-               return false;
-
        if (!(flags & BCH_READ_MAY_PROMOTE))
                return false;
 
        if (!opts.promote_target)
                return false;
 
-       if (bch2_extent_has_target(c, bkey_s_c_to_extent(k),
-                                  opts.promote_target))
+       if (bch2_bkey_has_target(c, k, opts.promote_target))
                return false;
 
        if (bch2_target_congested(c, opts.promote_target)) {
@@ -1028,6 +1033,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 
 noinline
 static struct promote_op *__promote_alloc(struct bch_fs *c,
+                                         enum btree_id btree_id,
                                          struct bpos pos,
                                          struct extent_ptr_decoded *pick,
                                          struct bch_io_opts opts,
@@ -1084,6 +1090,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                        (struct data_opts) {
                                .target = opts.promote_target
                        },
+                       btree_id,
                        bkey_s_c_null);
        BUG_ON(ret);
 
@@ -1121,7 +1128,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c,
        if (!should_promote(c, k, pos, opts, flags))
                return NULL;
 
-       promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+       promote = __promote_alloc(c,
+                                 k.k->type == KEY_TYPE_reflink_v
+                                 ? BTREE_ID_REFLINK
+                                 : BTREE_ID_EXTENTS,
+                                 pos, pick, opts, sectors, rbio);
        if (!promote)
                return NULL;
 
@@ -1222,17 +1233,16 @@ retry:
        k = bkey_i_to_s_c(&tmp.k);
        bch2_trans_unlock(&trans);
 
-       if (!bkey_extent_is_data(k.k) ||
-           !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
-                                    rbio->pick.ptr,
-                                    rbio->pos.offset -
-                                    rbio->pick.crc.offset)) {
+       if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+                                  rbio->pick.ptr,
+                                  rbio->pos.offset -
+                                  rbio->pick.crc.offset)) {
                /* extent we wanted to read no longer exists: */
                rbio->hole = true;
                goto out;
        }
 
-       ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+       ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
        if (ret == READ_RETRY)
                goto retry;
        if (ret)
@@ -1255,26 +1265,40 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
+
+       bch2_trans_init(&trans, c, 0, 0);
 retry:
+       bch2_trans_begin(&trans);
+
        for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
                           POS(inode, bvec_iter.bi_sector),
                           BTREE_ITER_SLOTS, k, ret) {
                BKEY_PADDED(k) tmp;
-               unsigned bytes;
+               unsigned bytes, sectors, offset_into_extent;
 
                bkey_reassemble(&tmp.k, k);
                k = bkey_i_to_s_c(&tmp.k);
+
+               offset_into_extent = iter->pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               ret = bch2_read_indirect_extent(&trans, iter,
+                                       &offset_into_extent, &tmp.k);
+               if (ret)
+                       break;
+
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
                bch2_trans_unlock(&trans);
 
-               bytes = min_t(unsigned, bvec_iter.bi_size,
-                             (k.k->p.offset - bvec_iter.bi_sector) << 9);
+               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
                swap(bvec_iter.bi_size, bytes);
 
-               ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+               ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+                               offset_into_extent, failed, flags);
                switch (ret) {
                case READ_RETRY:
                        goto retry;
@@ -1355,7 +1379,6 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_i_extent *e;
        BKEY_PADDED(k) new;
        struct bch_extent_crc_unpacked new_crc;
        u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
@@ -1374,34 +1397,30 @@ retry:
        if (IS_ERR_OR_NULL(k.k))
                goto out;
 
-       if (!bkey_extent_is_data(k.k))
-               goto out;
-
        bkey_reassemble(&new.k, k);
-       e = bkey_i_to_extent(&new.k);
+       k = bkey_i_to_s_c(&new.k);
 
-       if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
-                                    rbio->pick.ptr, data_offset) ||
-           bversion_cmp(e->k.version, rbio->version))
+       if (bversion_cmp(k.k->version, rbio->version) ||
+           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
                goto out;
 
        /* Extent was merged? */
-       if (bkey_start_offset(&e->k) < data_offset ||
-           e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+       if (bkey_start_offset(k.k) < data_offset ||
+           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
                goto out;
 
        if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
                        rbio->pick.crc, NULL, &new_crc,
-                       bkey_start_offset(&e->k) - data_offset, e->k.size,
+                       bkey_start_offset(k.k) - data_offset, k.k->size,
                        rbio->pick.crc.csum_type)) {
                bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
                goto out;
        }
 
-       if (!bch2_extent_narrow_crcs(e, new_crc))
+       if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
                goto out;
 
-       bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i));
+       bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k));
        ret = bch2_trans_commit(&trans, NULL, NULL,
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL|
@@ -1412,15 +1431,6 @@ out:
        bch2_trans_exit(&trans);
 }
 
-static bool should_narrow_crcs(struct bkey_s_c k,
-                              struct extent_ptr_decoded *pick,
-                              unsigned flags)
-{
-       return !(flags & BCH_READ_IN_RETRY) &&
-               bkey_extent_is_data(k.k) &&
-               bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
-}
-
 /* Inner part that may run in process context */
 static void __bch2_read_endio(struct work_struct *work)
 {
@@ -1455,7 +1465,7 @@ static void __bch2_read_endio(struct work_struct *work)
                goto nodecode;
 
        /* Adjust crc to point to subset of data we want: */
-       crc.offset     += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+       crc.offset     += rbio->offset_into_extent;
        crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
 
        if (crc.compression_type != BCH_COMPRESSION_NONE) {
@@ -1564,8 +1574,51 @@ static void bch2_read_endio(struct bio *bio)
        bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
 }
 
+int bch2_read_indirect_extent(struct btree_trans *trans,
+                             struct btree_iter *extent_iter,
+                             unsigned *offset_into_extent,
+                             struct bkey_i *orig_k)
+{
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       u64 reflink_offset;
+       int ret;
+
+       if (orig_k->k.type != KEY_TYPE_reflink_p)
+               return 0;
+
+       reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+               *offset_into_extent;
+
+       iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+                                    POS(0, reflink_offset),
+                                    BTREE_ITER_SLOTS, 1);
+       ret = PTR_ERR_OR_ZERO(iter);
+       if (ret)
+               return ret;
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_reflink_v) {
+               __bcache_io_error(trans->c,
+                               "pointer to nonexistent indirect extent");
+               ret = -EIO;
+               goto err;
+       }
+
+       *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+       bkey_reassemble(orig_k, k);
+err:
+       bch2_trans_iter_put(trans, iter);
+       return ret;
+}
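
Resolving a reflink pointer is pure offset arithmetic: add the read's
offset within the reflink_p extent to the pointer's idx, look that
position up in the reflink btree, and re-express the offset relative to
the start of the reflink_v extent found there. A toy walkthrough with
made-up numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long p_idx = 1000;  /* reflink_p->v.idx */
            unsigned offset_into_extent = 24; /* sectors into reflink_p */

            /* position to look up in the reflink btree */
            unsigned long long reflink_offset = p_idx + offset_into_extent;

            /* say the reflink_v extent found there starts at sector 992 */
            unsigned long long v_start = 992;

            offset_into_extent = reflink_offset - v_start;
            printf("read at sector %u of the indirect extent\n",
                   offset_into_extent);         /* 32 */
            return 0;
    }
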
+
 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
                       struct bvec_iter iter, struct bkey_s_c k,
+                      unsigned offset_into_extent,
                       struct bch_io_failures *failed, unsigned flags)
 {
        struct extent_ptr_decoded pick;
@@ -1598,7 +1651,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
                if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
                        goto hole;
 
-               iter.bi_sector  = pos.offset;
                iter.bi_size    = pick.crc.compressed_size << 9;
                goto noclone;
        }
@@ -1607,13 +1659,13 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
            bio_flagged(&orig->bio, BIO_CHAIN))
                flags |= BCH_READ_MUST_CLONE;
 
-       narrow_crcs = should_narrow_crcs(k, &pick, flags);
+       narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+               bch2_can_narrow_extent_crcs(k, pick.crc);
 
        if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
                flags |= BCH_READ_MUST_BOUNCE;
 
-       EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
-               k.k->p.offset < bvec_iter_end_sector(iter));
+       BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
 
        if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
            (pick.crc.csum_type != BCH_CSUM_NONE &&
@@ -1634,15 +1686,17 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
                        (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
                         bvec_iter_sectors(iter) != pick.crc.live_size ||
                         pick.crc.offset ||
-                        iter.bi_sector != pos.offset));
+                        offset_into_extent));
 
+               pos.offset += offset_into_extent;
                pick.ptr.offset += pick.crc.offset +
-                       (iter.bi_sector - pos.offset);
+                       offset_into_extent;
+               offset_into_extent              = 0;
                pick.crc.compressed_size        = bvec_iter_sectors(iter);
                pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
                pick.crc.offset                 = 0;
                pick.crc.live_size              = bvec_iter_sectors(iter);
-               pos.offset                      = iter.bi_sector;
+               offset_into_extent              = 0;
        }
 
        if (rbio) {
@@ -1697,6 +1751,7 @@ noclone:
        else
                rbio->end_io    = orig->bio.bi_end_io;
        rbio->bvec_iter         = iter;
+       rbio->offset_into_extent= offset_into_extent;
        rbio->flags             = flags;
        rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
        rbio->narrow_crcs       = narrow_crcs;
@@ -1815,45 +1870,67 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
        rbio->c = c;
        rbio->start_time = local_clock();
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
-                          POS(inode, rbio->bio.bi_iter.bi_sector),
-                          BTREE_ITER_SLOTS, k, ret) {
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+                                  POS(inode, rbio->bio.bi_iter.bi_sector),
+                                  BTREE_ITER_SLOTS);
+
+       while (1) {
                BKEY_PADDED(k) tmp;
-               unsigned bytes;
+               unsigned bytes, sectors, offset_into_extent;
+
+               bch2_btree_iter_set_pos(iter,
+                               POS(inode, rbio->bio.bi_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               bkey_reassemble(&tmp.k, k);
+               k = bkey_i_to_s_c(&tmp.k);
+
+               offset_into_extent = iter->pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               ret = bch2_read_indirect_extent(&trans, iter,
+                                       &offset_into_extent, &tmp.k);
+               if (ret)
+                       goto err;
+
+               /*
+                * With indirect extents, the amount of data to read is the min
+                * of the original extent and the indirect extent:
+                */
+               sectors = min(sectors, k.k->size - offset_into_extent);
 
                /*
                 * Unlock the iterator while the btree node's lock is still in
                 * cache, before doing the IO:
                 */
-               bkey_reassemble(&tmp.k, k);
-               k = bkey_i_to_s_c(&tmp.k);
                bch2_trans_unlock(&trans);
 
-               bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
-                             (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
                swap(rbio->bio.bi_iter.bi_size, bytes);
 
                if (rbio->bio.bi_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
-               bch2_read_extent(c, rbio, k, flags);
+               bch2_read_extent(c, rbio, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
-                       return;
+                       break;
 
                swap(rbio->bio.bi_iter.bi_size, bytes);
                bio_advance(&rbio->bio, bytes);
        }
-
-       /*
-        * If we get here, it better have been because there was an error
-        * reading a btree node
-        */
-       BUG_ON(!ret);
-       bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
-
+out:
        bch2_trans_exit(&trans);
+       return;
+err:
+       bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
        bch2_rbio_done(rbio);
+       goto out;
 }
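
The read loop temporarily clamps the bio's remaining size to the
current extent fragment with swap(), submits that fragment, then swaps
the full size back and advances; bch2_read_retry() above uses the same
trick on its bvec_iter. A simplified standalone sketch:

    #include <stdio.h>

    #define swap(a, b) \
            do { unsigned __t = (a); (a) = (b); (b) = __t; } while (0)

    int main(void)
    {
            unsigned remaining = 24 << 9;   /* bytes left in the request */

            while (remaining) {
                    unsigned bytes = remaining < (8 << 9)
                                   ? remaining : (8 << 9);

                    swap(remaining, bytes); /* clamp to one fragment */
                    printf("submit %u bytes\n", remaining);
                    swap(remaining, bytes); /* restore the full size */
                    remaining -= bytes;     /* advance past the fragment */
            }
            return 0;
    }
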
 
 void bch2_fs_io_exit(struct bch_fs *c)
index 1e8470afbeca60492e6880ae83015aaf64736639..7db3bd0ea19a3e1ffe4ff9944afe932e72a3554b 100644 (file)
@@ -95,9 +95,8 @@ struct bch_devs_mask;
 struct cache_promote_op;
 struct extent_ptr_decoded;
 
-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-                      struct bkey_s_c, struct bch_io_failures *, unsigned);
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *,
+                             unsigned *, struct bkey_i *);
 
 enum bch_read_flags {
        BCH_READ_RETRY_IF_STALE         = 1 << 0,
@@ -112,14 +111,22 @@ enum bch_read_flags {
        BCH_READ_IN_RETRY               = 1 << 7,
 };
 
+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *,
+                      struct bvec_iter, struct bkey_s_c, unsigned,
+                      struct bch_io_failures *, unsigned);
+
 static inline void bch2_read_extent(struct bch_fs *c,
                                    struct bch_read_bio *rbio,
                                    struct bkey_s_c k,
+                                   unsigned offset_into_extent,
                                    unsigned flags)
 {
-       __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
+       __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k,
+                          offset_into_extent, NULL, flags);
 }
 
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+
 static inline struct bch_read_bio *rbio_init(struct bio *bio,
                                             struct bch_io_opts opts)
 {
index 04f6d9a7c9a2af5c4fd71aed870f71062ab21a4e..2d397e5e5b9e8ca9051f06f748862064bf1b2b78 100644 (file)
@@ -38,6 +38,8 @@ struct bch_read_bio {
         */
        struct bvec_iter        bvec_iter;
 
+       unsigned                offset_into_extent;
+
        u16                     flags;
        union {
        struct {
index ad41f5e36a7c3e5a4b53626d603c8315f6490a07..dc3b03d6e627cb1aac2359a580176dc62482894d 100644 (file)
@@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
        return 0;
 }
 
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
+                                  enum btree_id btree_id)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -44,13 +45,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-                                  POS_MIN, BTREE_ITER_PREFETCH);
+       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+                                  BTREE_ITER_PREFETCH);
 
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
-               if (!bkey_extent_is_data(k.k) ||
-                   !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
+               if (!bch2_bkey_has_device(k, dev_idx)) {
                        ret = bch2_mark_bkey_replicas(c, k);
                        if (ret)
                                break;
@@ -99,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        return ret;
 }
 
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       return  __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
+               __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+}
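
With reflink, user data lives in two btrees, so evacuating a device has
to walk both BTREE_ID_EXTENTS and BTREE_ID_REFLINK; the "a ?: b" chain
above runs the second walk only if the first returned 0. A tiny sketch
of that chaining (a GNU C extension, used throughout the kernel):

    #include <stdio.h>

    static int drop_from(const char *btree)
    {
            printf("walking %s\n", btree);
            return 0;       /* nonzero would stop the chain */
    }

    int main(void)
    {
            /* b is only evaluated when a evaluates to zero */
            return drop_from("extents") ?: drop_from("reflink");
    }
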
+
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
        struct btree_trans trans;
index e7e58afed5ddf8e8a4519fe247bc6eb81cdc0943..9595ba7910d81ded7dfb3b6c778818d1ff1e8712 100644 (file)
@@ -64,13 +64,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, m->btree_id,
                                   bkey_start_pos(&bch2_keylist_front(keys)->k),
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        while (1) {
                struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-               struct bkey_i_extent *insert, *new =
+               struct bkey_i *insert;
+               struct bkey_i_extent *new =
                        bkey_i_to_extent(bch2_keylist_front(keys));
                BKEY_PADDED(k) _new, _insert;
                const union bch_extent_entry *entry;
@@ -83,32 +84,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                        break;
 
                if (bversion_cmp(k.k->version, new->k.version) ||
-                   !bkey_extent_is_data(k.k) ||
-                   !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
-                                            m->ptr, m->offset))
+                   !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
                        goto nomatch;
 
                if (m->data_cmd == DATA_REWRITE &&
-                   !bch2_extent_has_device(bkey_s_c_to_extent(k),
-                                           m->data_opts.rewrite_dev))
+                   !bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
                        goto nomatch;
 
                bkey_reassemble(&_insert.k, k);
-               insert = bkey_i_to_extent(&_insert.k);
+               insert = &_insert.k;
 
                bkey_copy(&_new.k, bch2_keylist_front(keys));
                new = bkey_i_to_extent(&_new.k);
 
-               bch2_cut_front(iter->pos, &insert->k_i);
+               bch2_cut_front(iter->pos, insert);
                bch2_cut_back(new->k.p, &insert->k);
                bch2_cut_back(insert->k.p, &new->k);
 
                if (m->data_cmd == DATA_REWRITE)
-                       bch2_bkey_drop_device(extent_i_to_s(insert).s,
+                       bch2_bkey_drop_device(bkey_i_to_s(insert),
                                              m->data_opts.rewrite_dev);
 
                extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-                       if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
+                       if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
                                /*
                                 * raced with another move op? extent already
                                 * has a pointer to the device we just wrote
@@ -124,18 +122,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                if (!did_work)
                        goto nomatch;
 
-               bch2_extent_narrow_crcs(insert,
+               bch2_bkey_narrow_crcs(insert,
                                (struct bch_extent_crc_unpacked) { 0 });
-               bch2_extent_normalize(c, extent_i_to_s(insert).s);
-               bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
-                                                op->opts.background_target,
-                                                op->opts.data_replicas);
+               bch2_extent_normalize(c, bkey_i_to_s(insert));
+               bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
+                                              op->opts.background_target,
+                                              op->opts.data_replicas);
 
                /*
                 * If we're not fully overwriting @k, and it's compressed, we
                 * need a reservation for all the pointers in @insert
                 */
-               nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+               nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
                         m->nr_ptrs_reserved;
 
                if (insert->k.size < k.k->size &&
@@ -151,7 +149,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                }
 
                bch2_trans_update(&trans,
-                               BTREE_INSERT_ENTRY(iter, &insert->k_i));
+                               BTREE_INSERT_ENTRY(iter, insert));
 
                ret = bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
@@ -216,10 +214,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                            struct bch_io_opts io_opts,
                            enum data_cmd data_cmd,
                            struct data_opts data_opts,
+                           enum btree_id btree_id,
                            struct bkey_s_c k)
 {
        int ret;
 
+       m->btree_id     = btree_id;
        m->data_cmd     = data_cmd;
        m->data_opts    = data_opts;
        m->nr_ptrs_reserved = 0;
@@ -267,11 +267,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                break;
        }
        case DATA_REWRITE: {
+               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                unsigned compressed_sectors = 0;
 
-               extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                        if (!p.ptr.cached &&
                            p.crc.compression_type != BCH_COMPRESSION_NONE &&
                            bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
@@ -395,14 +396,16 @@ static int bch2_move_extent(struct bch_fs *c,
                            struct moving_context *ctxt,
                            struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
-                           struct bkey_s_c_extent e,
+                           enum btree_id btree_id,
+                           struct bkey_s_c k,
                            enum data_cmd data_cmd,
                            struct data_opts data_opts)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       unsigned sectors = e.k->size, pages;
+       unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
        move_ctxt_wait_event(ctxt,
@@ -414,7 +417,7 @@ static int bch2_move_extent(struct bch_fs *c,
                SECTORS_IN_FLIGHT_PER_DEVICE);
 
        /* write path might have to decompress data: */
-       extent_for_each_ptr_decode(e, p, entry)
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
 
        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
@@ -424,8 +427,8 @@ static int bch2_move_extent(struct bch_fs *c,
                goto err;
 
        io->write.ctxt          = ctxt;
-       io->read_sectors        = e.k->size;
-       io->write_sectors       = e.k->size;
+       io->read_sectors        = k.k->size;
+       io->write_sectors       = k.k->size;
 
        bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->write.op.wbio.bio,
@@ -442,18 +445,18 @@ static int bch2_move_extent(struct bch_fs *c,
        io->rbio.bio.bi_iter.bi_size = sectors << 9;
 
        bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
-       io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
+       io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
        ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
-                                     data_cmd, data_opts, e.s_c);
+                                     data_cmd, data_opts, btree_id, k);
        if (ret)
                goto err_free_pages;
 
        atomic64_inc(&ctxt->stats->keys_moved);
-       atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+       atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
 
-       trace_move_extent(e.k);
+       trace_move_extent(k.k);
 
        atomic_add(io->read_sectors, &ctxt->read_sectors);
        list_add_tail(&io->list, &ctxt->reads);
@@ -463,7 +466,7 @@ static int bch2_move_extent(struct bch_fs *c,
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
-       bch2_read_extent(c, &io->rbio, e.s_c,
+       bch2_read_extent(c, &io->rbio, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
@@ -472,20 +475,21 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       trace_move_alloc_fail(e.k);
+       trace_move_alloc_fail(k.k);
        return ret;
 }
 
-int bch2_move_data(struct bch_fs *c,
-                  struct bch_ratelimit *rate,
-                  struct write_point_specifier wp,
-                  struct bpos start,
-                  struct bpos end,
-                  move_pred_fn pred, void *arg,
-                  struct bch_move_stats *stats)
+static int __bch2_move_data(struct bch_fs *c,
+               struct moving_context *ctxt,
+               struct bch_ratelimit *rate,
+               struct write_point_specifier wp,
+               struct bpos start,
+               struct bpos end,
+               move_pred_fn pred, void *arg,
+               struct bch_move_stats *stats,
+               enum btree_id btree_id)
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
-       struct moving_context ctxt = { .stats = stats };
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        BKEY_PADDED(k) tmp;
        struct btree_trans trans;
@@ -496,17 +500,13 @@ int bch2_move_data(struct bch_fs *c,
        u64 delay, cur_inum = U64_MAX;
        int ret = 0, ret2;
 
-       closure_init_stack(&ctxt.cl);
-       INIT_LIST_HEAD(&ctxt.reads);
-       init_waitqueue_head(&ctxt.wait);
-
        bch2_trans_init(&trans, c, 0, 0);
 
        stats->data_type = BCH_DATA_USER;
-       stats->btree_id = BTREE_ID_EXTENTS;
+       stats->btree_id = btree_id;
        stats->pos      = POS_MIN;
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+       iter = bch2_trans_get_iter(&trans, btree_id, start,
                                   BTREE_ITER_PREFETCH);
 
        if (rate)
@@ -531,7 +531,7 @@ int bch2_move_data(struct bch_fs *c,
 
                        if (unlikely(freezing(current))) {
                                bch2_trans_unlock(&trans);
-                               move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+                               move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
                                try_to_freeze();
                        }
                } while (delay);
@@ -582,13 +582,12 @@ peek:
                k = bkey_i_to_s_c(&tmp.k);
                bch2_trans_unlock(&trans);
 
-               ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
-                                       bkey_s_c_to_extent(k),
+               ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);
                if (ret2) {
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(&ctxt);
+                               bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
 
@@ -606,7 +605,32 @@ next_nondata:
                bch2_trans_cond_resched(&trans);
        }
 out:
-       bch2_trans_exit(&trans);
+       ret = bch2_trans_exit(&trans) ?: ret;
+
+       return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+                  struct bch_ratelimit *rate,
+                  struct write_point_specifier wp,
+                  struct bpos start,
+                  struct bpos end,
+                  move_pred_fn pred, void *arg,
+                  struct bch_move_stats *stats)
+{
+       struct moving_context ctxt = { .stats = stats };
+       int ret;
+
+       closure_init_stack(&ctxt.cl);
+       INIT_LIST_HEAD(&ctxt.reads);
+       init_waitqueue_head(&ctxt.wait);
+
+       stats->data_type = BCH_DATA_USER;
+
+       ret =   __bch2_move_data(c, &ctxt, rate, wp, start, end,
+                                pred, arg, stats, BTREE_ID_EXTENTS) ?:
+               __bch2_move_data(c, &ctxt, rate, wp, start, end,
+                                pred, arg, stats, BTREE_ID_REFLINK);
 
        move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
        closure_sync(&ctxt.cl);
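
With __bch2_move_data() split out above, a single moving_context now spans both btree passes: reads issued against BTREE_ID_EXTENTS and BTREE_ID_REFLINK are tracked by the same closure and wait queue, and bch2_move_data() drains them once at the end. A simplified standalone model of that shape (illustrative names, not the real bcachefs types):

    /* illustrative only: one shared context across both passes */
    struct ctx { int reads_in_flight; };

    static int move_one_btree(struct ctx *ctxt, int btree_id)
    {
            /* issue reads here, accounted in ctxt->reads_in_flight */
            (void) ctxt;
            (void) btree_id;
            return 0;
    }

    static int move_all(struct ctx *ctxt)
    {
            /* ctxt is drained once, by the caller, after both passes */
            return move_one_btree(ctxt, 0 /* extents */) ?:
                   move_one_btree(ctxt, 1 /* reflink */);
    }
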
index 71b3d2b2ddb6ddbcc1ef744a5e00676578563336..0acd1720d4f8571b3f7cf6f3e883be575c952d2b 100644 (file)
@@ -25,6 +25,7 @@ struct data_opts {
 };
 
 struct migrate_write {
+       enum btree_id           btree_id;
        enum data_cmd           data_cmd;
        struct data_opts        data_opts;
 
@@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
                            struct write_point_specifier,
                            struct bch_io_opts,
                            enum data_cmd, struct data_opts,
-                           struct bkey_s_c);
+                           enum btree_id, struct bkey_s_c);
 
 typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
                                struct bkey_s_c,
index b13af5662f220e73d7f5f354721dbcb5d4e1a4eb..710296044194f349978cbc8d74d94091b48dea8d 100644 (file)
@@ -69,26 +69,19 @@ static bool __copygc_pred(struct bch_dev *ca,
                          struct bkey_s_c k)
 {
        copygc_heap *h = &ca->copygc_heap;
+       const struct bch_extent_ptr *ptr =
+               bch2_bkey_has_device(k, ca->dev_idx);
 
-       switch (k.k->type) {
-       case KEY_TYPE_extent: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               const struct bch_extent_ptr *ptr =
-                       bch2_extent_has_device(e, ca->dev_idx);
+       if (ptr) {
+               struct copygc_heap_entry search = { .offset = ptr->offset };
 
-               if (ptr) {
-                       struct copygc_heap_entry search = { .offset = ptr->offset };
+               ssize_t i = eytzinger0_find_le(h->data, h->used,
+                                              sizeof(h->data[0]),
+                                              bucket_offset_cmp, &search);
 
-                       ssize_t i = eytzinger0_find_le(h->data, h->used,
-                                                      sizeof(h->data[0]),
-                                                      bucket_offset_cmp, &search);
-
-                       return (i >= 0 &&
-                               ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
-                               ptr->gen == h->data[i].gen);
-               }
-               break;
-       }
+               return (i >= 0 &&
+                       ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+                       ptr->gen == h->data[i].gen);
        }
 
        return false;
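
The flattened __copygc_pred() above now works for any key type carrying pointers: bch2_bkey_has_device() replaces the old KEY_TYPE_extent special case. The heap query itself is unchanged; eytzinger0_find_le() returns the greatest entry whose offset is at most the pointer's offset. The sketch below models the same find-le contract over a plain sorted array (the Eytzinger layout changes only the memory order searched, not the answer):

    #include <stddef.h>
    #include <sys/types.h>

    struct heap_entry { unsigned long long offset; };

    /* greatest index i with e[i].offset <= off, or -1: the same
     * contract as eytzinger0_find_le(), over a sorted array */
    static ssize_t find_le(const struct heap_entry *e, size_t n,
                           unsigned long long off)
    {
            ssize_t l = 0, r = (ssize_t) n - 1, best = -1;

            while (l <= r) {
                    ssize_t m = l + (r - l) / 2;

                    if (e[m].offset <= off) {
                            best = m;       /* candidate; look right */
                            l = m + 1;
                    } else {
                            r = m - 1;
                    }
            }
            return best;
    }

The predicate then confirms the pointer actually falls inside that candidate bucket (offset < entry offset + bucket size) and that its generation still matches, so stale pointers are not copied.
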
index 6bdd68177ac94896d5935ddcffd57e414acf68a9..4797d620fe7723fb8c9591fc4f9b8ee7e4f9642f 100644 (file)
@@ -38,9 +38,9 @@ void bch2_rebalance_add_key(struct bch_fs *c,
                            struct bkey_s_c k,
                            struct bch_io_opts *io_opts)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       struct bkey_s_c_extent e;
 
        if (!bkey_extent_is_data(k.k))
                return;
@@ -49,9 +49,7 @@ void bch2_rebalance_add_key(struct bch_fs *c,
            !io_opts->background_compression)
                return;
 
-       e = bkey_s_c_to_extent(k);
-
-       extent_for_each_ptr_decode(e, p, entry)
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                if (rebalance_ptr_pred(c, p, io_opts)) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
index 92867b5c078f744a7044f1e3ca8b0c4326cdebd9..f2899ba9ad43822e33d58aa7e4c5088b849db823 100644 (file)
@@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq)
                bch2_journal_pin_put(j, j->replay_journal_seq++);
 }
 
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
+                                 struct bkey_i *k)
 {
        struct btree_trans trans;
        struct btree_iter *iter, *split_iter;
@@ -247,6 +248,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
        struct disk_reservation disk_res =
                bch2_disk_reservation_init(c, 0);
        struct bkey_i *split;
+       struct bpos atomic_end;
        bool split_compressed = false;
        int ret;
 
@@ -254,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+       iter = bch2_trans_get_iter(&trans, btree_id,
                                   bkey_start_pos(&k->k),
                                   BTREE_ITER_INTENT);
 
@@ -273,9 +275,14 @@ retry:
                if (ret)
                        goto err;
 
+               ret = bch2_extent_atomic_end(&trans, split_iter,
+                                            k, &atomic_end);
+               if (ret)
+                       goto err;
+
                if (!split_compressed &&
                    bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
-                   !bch2_extent_is_atomic(k, split_iter)) {
+                   bkey_cmp(atomic_end, k->k.p) < 0) {
                        ret = bch2_disk_reservation_add(c, &disk_res,
                                        k->k.size *
                                        bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
@@ -287,7 +294,7 @@ retry:
 
                bkey_copy(split, k);
                bch2_cut_front(split_iter->pos, split);
-               bch2_extent_trim_atomic(split, split_iter);
+               bch2_cut_back(atomic_end, &split->k);
 
                bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
                bch2_btree_iter_set_pos(iter, split->k.p);
@@ -295,7 +302,7 @@ retry:
 
        if (split_compressed) {
                ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
-                                         -((s64) k->k.size),
+                                         0, -((s64) k->k.size),
                                          BCH_BUCKET_MARK_OVERWRITE) ?:
                      bch2_trans_commit(&trans, &disk_res, NULL,
                                        BTREE_INSERT_ATOMIC|
@@ -335,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c,
        for_each_journal_key(keys, i) {
                replay_now_at(j, keys.journal_seq_base + i->journal_seq);
 
-               switch (i->btree_id) {
-               case BTREE_ID_ALLOC:
+               if (i->btree_id == BTREE_ID_ALLOC)
                        ret = bch2_alloc_replay_key(c, i->k);
-                       break;
-               case BTREE_ID_EXTENTS:
-                       ret = bch2_extent_replay_key(c, i->k);
-                       break;
-               default:
+               else if (btree_node_type_is_extents(i->btree_id))
+                       ret = bch2_extent_replay_key(c, i->btree_id, i->k);
+               else
                        ret = bch2_btree_insert(c, i->btree_id, i->k,
                                                NULL, NULL,
                                                BTREE_INSERT_NOFAIL|
                                                BTREE_INSERT_LAZY_RW|
                                                BTREE_INSERT_JOURNAL_REPLAY|
                                                BTREE_INSERT_NOMARK);
-                       break;
-               }
 
                if (ret) {
                        bch_err(c, "journal replay: error %d while replaying key",
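
In the replay path above, bch2_extent_replay_key() now computes atomic_end explicitly and clips the key with bch2_cut_front()/bch2_cut_back(), replacing the old bch2_extent_is_atomic()/bch2_extent_trim_atomic() pair; the disk reservation is taken only when a compressed extent will genuinely be split (atomic_end < k->k.p). A standalone model of one split iteration, with plain integers standing in for struct bpos:

    struct span { unsigned long long start, end; };

    /* model of one split iteration: clip k to [pos, atomic_end) */
    static struct span clip(struct span k, unsigned long long pos,
                            unsigned long long atomic_end)
    {
            struct span split = k;

            if (split.start < pos)          /* bch2_cut_front() */
                    split.start = pos;
            if (split.end > atomic_end)     /* bch2_cut_back() */
                    split.end = atomic_end;
            return split;
    }

The loop then advances the iterator to the split's end position and repeats until the whole replayed key has been inserted.
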
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
new file mode 100644 (file)
index 0000000..dcca9c1
--- /dev/null
+++ b/libbcachefs/reflink.c
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "reflink.h"
+
+#include <linux/sched/signal.h>
+
+/* reflink pointers */
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+       if (bkey_val_bytes(p.k) != sizeof(*p.v))
+               return "incorrect value size";
+
+       return NULL;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
+{
+       struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+       pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+}
+
+enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
+                                      struct bkey_s _l, struct bkey_s _r)
+{
+       struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+       struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
+
+       if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+               return BCH_MERGE_NOMERGE;
+
+       if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
+               bch2_key_resize(l.k, KEY_SIZE_MAX);
+               __bch2_cut_front(l.k->p, _r);
+               return BCH_MERGE_PARTIAL;
+       }
+
+       bch2_key_resize(l.k, l.k->size + r.k->size);
+
+       return BCH_MERGE_MERGE;
+}
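
/*
 * Standalone model of the merge above (simplified fields, not bkeys):
 * adjacent reflink pointers merge outright unless the combined size
 * would exceed KEY_SIZE_MAX; then the left key grows to the cap, the
 * right key is cut down by the amount absorbed, and the result is a
 * partial merge.
 */
enum merge { NOMERGE, PARTIAL, MERGE };

struct rp { unsigned long long idx, size; };

static enum merge rp_merge(struct rp *l, struct rp *r,
			   unsigned long long size_max /* KEY_SIZE_MAX */)
{
	if (l->idx + l->size != r->idx)		/* not adjacent in the */
		return NOMERGE;			/* reflink btree */

	if (l->size + r->size > size_max) {
		unsigned long long moved = size_max - l->size;

		l->size  = size_max;		/* bch2_key_resize() */
		r->idx  += moved;		/* __bch2_cut_front() */
		r->size -= moved;
		return PARTIAL;
	}

	l->size += r->size;
	return MERGE;
}
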
+
+/* indirect extents */
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+       if (bkey_val_bytes(r.k) < sizeof(*r.v))
+               return "incorrect value size";
+
+       return bch2_bkey_ptrs_invalid(c, k);
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
+{
+       struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+       pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+       bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/*
+ * bch2_remap_range() depends on bch2_extent_update(), which depends on various
+ * things tied to the linux vfs for inode updates, for now:
+ */
+#ifndef NO_BCACHEFS_FS
+
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+                                    struct btree_iter *extent_iter,
+                                    struct bkey_i_extent *e)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *reflink_iter;
+       struct bkey_s_c k;
+       struct bkey_i_reflink_v *r_v;
+       struct bkey_i_reflink_p *r_p;
+       int ret;
+
+       for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+                          POS(0, c->reflink_hint),
+                          BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+               if (reflink_iter->pos.inode) {
+                       bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+                       continue;
+               }
+
+               if (bkey_deleted(k.k) && e->k.size <= k.k->size)
+                       break;
+       }
+
+       if (ret)
+               goto err;
+
+       /* rewind iter to start of hole, if necessary: */
+       bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
+
+       r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
+       ret = PTR_ERR_OR_ZERO(r_v);
+       if (ret)
+               goto err;
+
+       bkey_reflink_v_init(&r_v->k_i);
+       r_v->k.p        = reflink_iter->pos;
+       bch2_key_resize(&r_v->k, e->k.size);
+       r_v->k.version  = e->k.version;
+
+       set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
+                         bkey_val_u64s(&e->k));
+       r_v->v.refcount = 0;
+       memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
+
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i));
+
+       r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
+       ret = PTR_ERR_OR_ZERO(r_p);
+       if (ret)
+               goto err;
+
+       e->k.type = KEY_TYPE_reflink_p;
+       r_p = bkey_i_to_reflink_p(&e->k_i);
+       set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+       r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+       bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i));
+err:
+       if (!IS_ERR(reflink_iter)) {
+               c->reflink_hint = reflink_iter->pos.offset;
+               bch2_trans_iter_put(trans, reflink_iter);
+       }
+
+       return ret;
+}
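
/*
 * Shape of the indirection created above (illustrative structs, not
 * the on-disk layout): the extents btree is left with a small pointer
 * key whose idx names the start of a refcounted reflink_v in the
 * reflink btree, where the original extent's pointers now live exactly
 * once.  c->reflink_hint remembers where the last hole was found so
 * the next allocation does not rescan from offset 0.
 */
struct model_reflink_p { unsigned long long idx; };	  /* extents btree */
struct model_reflink_v { unsigned long long refcount; };  /* reflink btree,
							     followed by ptrs */
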
+
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+       struct bkey_s_c k = bch2_btree_iter_peek(iter);
+
+       while (1) {
+               if (bkey_err(k))
+                       return k;
+
+               if (bkey_cmp(iter->pos, end) >= 0)
+                       return bkey_s_c_null;
+
+               if (k.k->type == KEY_TYPE_extent ||
+                   k.k->type == KEY_TYPE_reflink_p)
+                       return k;
+
+               k = bch2_btree_iter_next(iter);
+       }
+}
+
+s64 bch2_remap_range(struct bch_fs *c,
+                    struct bch_inode_info *dst_inode,
+                    struct bpos dst_start, struct bpos src_start,
+                    u64 remap_sectors, u64 new_i_size)
+{
+       struct btree_trans trans;
+       struct btree_iter *dst_iter, *src_iter;
+       struct bkey_s_c src_k;
+       BKEY_PADDED(k) new_dst, new_src;
+       struct bpos dst_end = dst_start, src_end = src_start;
+       struct bpos dst_want, src_want;
+       u64 src_done, dst_done;
+       int ret = 0;
+
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+               mutex_lock(&c->sb_lock);
+               if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+                       c->disk_sb.sb->features[0] |=
+                               cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
+
+                       bch2_write_super(c);
+               }
+               mutex_unlock(&c->sb_lock);
+       }
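+
+       /*
+        * Note the check / lock / re-check shape above: the unlocked test
+        * keeps the common already-enabled case free of sb_lock, and the
+        * second test under the lock stops two racing remaps from both
+        * rewriting the superblock to set BCH_FEATURE_REFLINK.
+        */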
+
+       dst_end.offset += remap_sectors;
+       src_end.offset += remap_sectors;
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+
+       src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+                                        BTREE_ITER_INTENT, 1);
+       dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+                                        BTREE_ITER_INTENT, 2);
+
+       while (1) {
+               bch2_trans_begin_updates(&trans);
+               trans.mem_top = 0;
+
+               if (fatal_signal_pending(current)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+
+               src_k = get_next_src(src_iter, src_end);
+               ret = bkey_err(src_k);
+               if (ret)
+                       goto btree_err;
+
+               src_done = bpos_min(src_iter->pos, src_end).offset -
+                       src_start.offset;
+               dst_want = POS(dst_start.inode, dst_start.offset + src_done);
+
+               if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
+                       ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
+                                            dst_inode, new_i_size);
+                       if (ret)
+                               goto btree_err;
+                       continue;
+               }
+
+               BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
+
+               if (!bkey_cmp(dst_iter->pos, dst_end))
+                       break;
+
+               if (src_k.k->type == KEY_TYPE_extent) {
+                       bkey_reassemble(&new_src.k, src_k);
+                       src_k = bkey_i_to_s_c(&new_src.k);
+
+                       bch2_cut_front(src_iter->pos,   &new_src.k);
+                       bch2_cut_back(src_end,          &new_src.k.k);
+
+                       ret = bch2_make_extent_indirect(&trans, src_iter,
+                                               bkey_i_to_extent(&new_src.k));
+                       if (ret)
+                               goto btree_err;
+
+                       BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+               }
+
+               if (src_k.k->type == KEY_TYPE_reflink_p) {
+                       struct bkey_s_c_reflink_p src_p =
+                               bkey_s_c_to_reflink_p(src_k);
+                       struct bkey_i_reflink_p *dst_p =
+                               bkey_reflink_p_init(&new_dst.k);
+
+                       u64 offset = le64_to_cpu(src_p.v->idx) +
+                               (src_iter->pos.offset -
+                                bkey_start_offset(src_k.k));
+
+                       dst_p->v.idx = cpu_to_le64(offset);
+               } else {
+                       BUG();
+               }
+
+               new_dst.k.k.p = dst_iter->pos;
+               bch2_key_resize(&new_dst.k.k,
+                               min(src_k.k->p.offset - src_iter->pos.offset,
+                                   dst_end.offset - dst_iter->pos.offset));
+
+               ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
+                                        dst_iter, &new_dst.k,
+                                        new_i_size, false, true, NULL);
+               if (ret)
+                       goto btree_err;
+
+               dst_done = dst_iter->pos.offset - dst_start.offset;
+               src_want = POS(src_start.inode, src_start.offset + dst_done);
+               bch2_btree_iter_set_pos(src_iter, src_want);
+btree_err:
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       goto err;
+       }
+
+       BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
+err:
+       BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+
+       dst_done = dst_iter->pos.offset - dst_start.offset;
+       new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+
+       ret = bch2_trans_exit(&trans) ?: ret;
+
+       mutex_lock(&dst_inode->ei_update_lock);
+       if (dst_inode->v.i_size < new_i_size) {
+               i_size_write(&dst_inode->v, new_i_size);
+               ret = bch2_write_inode_size(c, dst_inode, new_i_size,
+                                           ATTR_MTIME|ATTR_CTIME);
+       }
+       mutex_unlock(&dst_inode->ei_update_lock);
+
+       return dst_done ?: ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
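
bch2_remap_range() finishes with dst_done ?: ret, so partial progress wins over a late error: a caller sees a short sector count rather than losing ranges already committed, and only a remap that made no progress at all reports the error itself. A standalone model of that return convention:

    #include <stdio.h>

    static long long remap_result(long long dst_done, int ret)
    {
            return dst_done ?: ret;  /* sectors remapped, else the error */
    }

    int main(void)
    {
            printf("%lld\n", remap_result(128, -4)); /* 128: progress masks the error */
            printf("%lld\n", remap_result(0, -4));   /* -4: nothing remapped */
            return 0;
    }
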
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
new file mode 100644 (file)
index 0000000..327618c
--- /dev/null
+++ b/libbcachefs/reflink.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+                           struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+                                      struct bkey_s, struct bkey_s);
+
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) {            \
+       .key_invalid    = bch2_reflink_p_invalid,               \
+       .val_to_text    = bch2_reflink_p_to_text,               \
+       .key_merge      = bch2_reflink_p_merge,                 \
+}
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+                           struct bkey_s_c);
+
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) {            \
+       .key_invalid    = bch2_reflink_v_invalid,               \
+       .val_to_text    = bch2_reflink_v_to_text,               \
+}
+
+#ifndef NO_BCACHEFS_FS
+s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
+                    struct bpos, struct bpos, u64, u64);
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_REFLINK_H */
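
The bch2_bkey_ops_reflink_* compound-literal macros above follow the pattern the rest of bcachefs uses to populate its per-key-type operations table, through which validation and printing are dispatched. A simplified standalone model (hypothetical names and slot index):

    #include <stddef.h>

    struct ops_model {
            const char *(*key_invalid)(const void *key);
    };

    /* NULL return means "valid", matching the real key_invalid hooks */
    static const char *reflink_p_invalid_model(const void *key)
    {
            (void) key;
            return NULL;
    }

    #define ops_reflink_p_model (struct ops_model) {    \
            .key_invalid    = reflink_p_invalid_model,  \
    }

    static struct ops_model ops_table[32];

    int main(void)
    {
            ops_table[15] = ops_reflink_p_model;  /* illustrative slot */

            return ops_table[15].key_invalid(NULL) != NULL;
    }
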
index 4818453c015a405a09d55924605b91fdaba1a95e..f84de35cee2bb9c8f2ca939bc1886c98f7d9e528 100644 (file)
@@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_extent:
+       case KEY_TYPE_reflink_v:
                e->data_type = BCH_DATA_USER;
                extent_to_replicas(k, e);
                break;