-22776fe9902b0b06d6aa18cd4c7f0c5ad35a95fa
+ece184f718c2b678738bc2c42906e90eeb8ba7dc
bch2_trans_init(&trans, c, 0, 0);
for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
- bch2_mark_key(c, k, 0, NULL, 0,
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
for_each_journal_key(*journal_keys, j)
if (j->btree_id == BTREE_ID_ALLOC)
- bch2_mark_key(c, bkey_i_to_s_c(j->k), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(j->k),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
GC_PHASE_BTREE_XATTRS,
GC_PHASE_BTREE_ALLOC,
GC_PHASE_BTREE_QUOTAS,
+ GC_PHASE_BTREE_REFLINK,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
+ /* REFLINK */
+ u64 reflink_hint;
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
x(xattr, 11) \
x(alloc, 12) \
x(quota, 13) \
- x(stripe, 14)
+ x(stripe, 14) \
+ x(reflink_p, 15) \
+ x(reflink_v, 16)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
struct bch_extent_ptr ptrs[0];
} __attribute__((packed, aligned(8)));
+/* Reflink: */
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+
+ __le32 reservation_generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+};
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[0];
+};
+
/* Optional/variable size superblock sections: */
struct bch_sb_field {
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
+ BCH_FEATURE_REFLINK = 6,
BCH_FEATURE_NR,
};
x(XATTRS, 3, "xattrs") \
x(ALLOC, 4, "alloc") \
x(QUOTAS, 5, "quotas") \
- x(EC, 6, "erasure_coding")
+ x(EC, 6, "erasure_coding") \
+ x(REFLINK, 7, "reflink")
enum btree_id {
#define x(kwd, val, name) BTREE_ID_##kwd = val,
k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
}
-#define bkey_val_end(_k) vstruct_idx((_k).v, bkey_val_u64s((_k).k))
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
BKEY_VAL_ACCESSORS(alloc);
BKEY_VAL_ACCESSORS(quota);
BKEY_VAL_ACCESSORS(stripe);
+BKEY_VAL_ACCESSORS(reflink_p);
+BKEY_VAL_ACCESSORS(reflink_v);
/* byte order helpers */
#include "extents.h"
#include "inode.h"
#include "quota.h"
+#include "reflink.h"
#include "xattr.h"
-const char * const bch_bkey_types[] = {
+const char * const bch2_bkey_types[] = {
#define x(name, nr) #name,
BCH_BKEY_TYPES()
#undef x
void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
{
- pr_buf(out, "u64s %u type %u ", k->u64s, k->type);
+ pr_buf(out, "u64s %u type %s ", k->u64s,
+ bch2_bkey_types[k->type]);
bch2_bpos_to_text(out, k->p);
if (likely(ops->val_to_text))
ops->val_to_text(out, c, k);
- else
- pr_buf(out, " %s", bch_bkey_types[k.k->type]);
}
void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey;
enum btree_node_type;
-extern const char * const bch_bkey_types[];
+extern const char * const bch2_bkey_types[];
enum merge_result {
BCH_MERGE_NOMERGE,
static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
struct btree *);
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
+{
+ unsigned n = ARRAY_SIZE(iter->data);
+
+ while (n && __btree_node_iter_set_end(iter, n - 1))
+ --n;
+
+ return n;
+}
+
struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
{
unsigned offset = __btree_node_key_to_offset(b, k);
{
struct btree_node_iter_set *set;
- printk(KERN_ERR "btree node iter with %u sets:\n", b->nsets);
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
+ __btree_node_iter_used(iter), b->nsets);
btree_node_iter_for_each(iter, set) {
struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
char buf[100];
bch2_bkey_to_text(&PBUF(buf), &uk);
- printk(KERN_ERR "set %zu key %zi/%u: %s\n", t - b->set,
- k->_data - bset(b, t)->_data, bset(b, t)->u64s, buf);
+ printk(KERN_ERR "set %zu key %u: %s\n",
+ t - b->set, set->k, buf);
}
}
struct btree *b)
{
struct btree_node_iter_set *set, *s2;
+ struct bkey_packed *k, *p;
struct bset_tree *t;
+ if (bch2_btree_node_iter_end(iter))
+ return;
+
/* Verify no duplicates: */
btree_node_iter_for_each(iter, set)
btree_node_iter_for_each(iter, s2)
btree_node_iter_for_each(iter, set)
BUG_ON(set != iter->data &&
btree_node_iter_cmp(b, set[-1], set[0]) > 0);
+
+ k = bch2_btree_node_iter_peek_all(iter, b);
+
+ for_each_bset(b, t) {
+ if (iter->data[0].end == t->end_offset)
+ continue;
+
+ p = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
+
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
+ }
}
void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
__bch2_btree_node_iter_advance(iter, b);
}
-static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
-{
- unsigned n = ARRAY_SIZE(iter->data);
-
- while (n && __btree_node_iter_set_end(iter, n - 1))
- --n;
-
- return n;
-}
-
/*
* Expensive:
*/
-struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
- struct btree *b,
- unsigned min_key_type)
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
+ struct btree *b)
{
struct bkey_packed *k, *prev = NULL;
- struct bkey_packed *orig_pos = bch2_btree_node_iter_peek_all(iter, b);
struct btree_node_iter_set *set;
struct bset_tree *t;
unsigned end = 0;
bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
- k = bch2_bkey_prev_filter(b, t,
- bch2_btree_node_iter_bset_pos(iter, b, t),
- min_key_type);
+ k = bch2_bkey_prev_all(b, t,
+ bch2_btree_node_iter_bset_pos(iter, b, t));
if (k &&
(!prev || bkey_iter_cmp(b, k, prev) > 0)) {
prev = k;
}
if (!prev)
- goto out;
+ return NULL;
/*
* We're manually memmoving instead of just calling sort() to ensure the
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
-out:
- if (btree_keys_expensive_checks(b)) {
- struct btree_node_iter iter2 = *iter;
- if (prev)
- __bch2_btree_node_iter_advance(&iter2, b);
+ bch2_btree_node_iter_verify(iter, b);
+ return prev;
+}
- while ((k = bch2_btree_node_iter_peek_all(&iter2, b)) != orig_pos) {
- BUG_ON(k->type >= min_key_type);
- __bch2_btree_node_iter_advance(&iter2, b);
- }
- }
+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter,
+ struct btree *b,
+ unsigned min_key_type)
+{
+ struct bkey_packed *prev;
+
+ do {
+ prev = bch2_btree_node_iter_prev_all(iter, b);
+ } while (prev && prev->type < min_key_type);
return prev;
}
return ret;
}
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
+ struct btree *);
struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
struct btree *, unsigned);
-static inline struct bkey_packed *
-bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
-{
- return bch2_btree_node_iter_prev_filter(iter, b, 0);
-}
-
static inline struct bkey_packed *
bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
{
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
- bch2_mark_key(c, k, k.k->size, NULL, 0, flags);
+ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags);
fsck_err:
return ret;
}
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
- bch2_mark_key(c, bkey_i_to_s_c(&d->key), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
struct btree_iter *linked;
unsigned readers = 0;
- EBUG_ON(btree_node_read_locked(iter, b->level));
+ EBUG_ON(!btree_node_intent_locked(iter, b->level));
trans_for_each_iter(iter->trans, linked)
if (linked->l[b->level].b == b &&
#endif
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
+ struct btree *b,
+ struct bset_tree *t,
+ struct bkey_packed *k)
+{
+ struct btree_node_iter_set *set;
+
+ btree_node_iter_for_each(iter, set)
+ if (set->end == t->end_offset) {
+ set->k = __btree_node_key_to_offset(b, k);
+ bch2_btree_node_iter_sort(iter, b);
+ return;
+ }
+
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
+}
+
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
bch2_btree_node_iter_peek_all(node_iter, b),
&iter->k);
}
- return;
+
+ goto iter_current_key_not_modified;
found:
set->end = t->end_offset;
bkey_disassemble(l->b, k, &iter->k);
}
iter_current_key_not_modified:
-
/*
- * Interior nodes are special because iterators for interior nodes don't
- * obey the usual invariants regarding the iterator position:
- *
- * We may have whiteouts that compare greater than the iterator
- * position, and logically should be in the iterator, but that we
- * skipped past to find the first live key greater than the iterator
- * position. This becomes an issue when we insert a new key that is
- * greater than the current iterator position, but smaller than the
- * whiteouts we've already skipped past - this happens in the course of
- * a btree split.
- *
- * We have to rewind the iterator past to before those whiteouts here,
- * else bkey_node_iter_prev() is not going to work and who knows what
- * else would happen. And we have to do it manually, because here we've
- * already done the insert and the iterator is currently inconsistent:
- *
- * We've got multiple competing invariants, here - we have to be careful
- * about rewinding iterators for interior nodes, because they should
- * always point to the key for the child node the btree iterator points
- * to.
+ * When a new key is added, and the node iterator now points to that
+ * key, the iterator might have skipped past deleted keys that should
+ * come after the key the iterator now points to. We have to rewind to
+ * before those deleted keys - otherwise bch2_btree_node_iter_prev_all()
+ * breaks:
*/
- if (b->level && new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ if (!bch2_btree_node_iter_end(node_iter) &&
+ (b->level ||
+ (iter->flags & BTREE_ITER_IS_EXTENTS))) {
struct bset_tree *t;
- struct bkey_packed *k;
+ struct bkey_packed *k, *k2, *p;
+
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
for_each_bset(b, t) {
- if (bch2_bkey_to_bset(b, where) == t)
+ bool set_pos = false;
+
+ if (node_iter->data[0].end == t->end_offset)
continue;
- k = bch2_bkey_prev_all(b, t,
- bch2_btree_node_iter_bset_pos(node_iter, b, t));
- if (k &&
- bkey_iter_cmp(b, k, where) > 0) {
- struct btree_node_iter_set *set;
- unsigned offset =
- __btree_node_key_to_offset(b, bkey_next(k));
-
- btree_node_iter_for_each(node_iter, set)
- if (set->k == offset) {
- set->k = __btree_node_key_to_offset(b, k);
- bch2_btree_node_iter_sort(node_iter, b);
- goto next_bset;
- }
-
- bch2_btree_node_iter_push(node_iter, b, k,
- btree_bkey_last(b, t));
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
+
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
+ bkey_iter_cmp(b, k, p) < 0) {
+ k2 = p;
+ set_pos = true;
}
-next_bset:
- t = t;
+
+ if (set_pos)
+ btree_node_iter_set_set_pos(node_iter,
+ b, t, k2);
}
}
+
+ bch2_btree_node_iter_verify(node_iter, b);
}
void bch2_btree_node_iter_fix(struct btree_iter *iter,
recheck:
while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_deleted(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) == 0)
+ bkey_cmp(k.k->p, iter->pos) <= 0)
bch2_btree_node_iter_advance(&l->iter, l->b);
/*
EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
EBUG_ON(bkey_deleted(k.k));
iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
return k;
}
iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
return (struct bkey_s_c) { &iter->k, NULL };
}
goto recheck;
}
- if (k.k &&
- !bkey_deleted(k.k) &&
- !bkey_cmp(iter->pos, k.k->p)) {
- iter->uptodate = BTREE_ITER_UPTODATE;
- return k;
- } else {
+ if (!k.k ||
+ bkey_deleted(k.k) ||
+ bkey_cmp(iter->pos, k.k->p)) {
/* hole */
bkey_init(&iter->k);
iter->k.p = iter->pos;
-
- iter->uptodate = BTREE_ITER_UPTODATE;
- return (struct bkey_s_c) { &iter->k, NULL };
+ k = (struct bkey_s_c) { &iter->k, NULL };
}
+
+ iter->uptodate = BTREE_ITER_UPTODATE;
+ __bch2_btree_iter_verify(iter, l->b);
+ return k;
}
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
iter->flags &= ~(BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
iter->flags |= flags & (BTREE_ITER_INTENT|BTREE_ITER_PREFETCH);
+
+ if ((iter->flags & BTREE_ITER_INTENT) &&
+ !bch2_btree_iter_upgrade(iter, 1)) {
+ trace_trans_restart_upgrade(trans->ip);
+ return ERR_PTR(-EINTR);
+ }
}
BUG_ON(iter->btree_id != btree_id);
(_start), (_flags))) ?: \
PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_peek(_iter, _flags)).k); \
- !ret && (_k).k; \
+ !_ret && (_k).k; \
(_ret) = PTR_ERR_OR_ZERO(((_k) = \
__bch2_btree_iter_next(_iter, _flags)).k))
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
- return type == BKEY_TYPE_EXTENTS;
+ switch (type) {
+ case BKEY_TYPE_EXTENTS:
+ case BKEY_TYPE_REFLINK:
+ return true;
+ default:
+ return false;
+ }
}
static inline bool btree_node_is_extents(struct btree *b)
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_INODES:
case BKEY_TYPE_EC:
+ case BKEY_TYPE_REFLINK:
return true;
default:
return false;
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0)
bch2_mark_key_locked(c, bkey_i_to_s_c(&d->key),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_OVERWRITE|
BCH_BUCKET_MARK_GC);
}
{
BUG_ON(!pending->index_update_done);
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
- BCH_BUCKET_MARK_OVERWRITE);
+ bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+ 0, 0, NULL, 0, BCH_BUCKET_MARK_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
- bch2_mark_key(c, bkey_i_to_s_c(&pending->key), 0, NULL, 0,
+ bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_OVERWRITE|
BCH_BUCKET_MARK_GC);
}
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT|
BCH_BUCKET_MARK_GC);
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_node(b)))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT|
BCH_BUCKET_MARK_GC);
fs_usage = bch2_fs_usage_scratch_get(c);
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
- 0, fs_usage, 0,
+ 0, 0, fs_usage, 0,
BCH_BUCKET_MARK_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
- 0, NULL, 0,
+ 0, 0, NULL, 0,
BCH_BUCKET_MARK_INSERT||
BCH_BUCKET_MARK_GC);
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
- !bch2_extent_is_atomic(i->k, i->iter));
-
+ bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0);
EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) &&
!(trans->flags & BTREE_INSERT_ATOMIC));
}
{
return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
(i->iter->btree_id == BTREE_ID_EXTENTS ||
- i->iter->btree_id == BTREE_ID_INODES);
+ i->iter->btree_id == BTREE_ID_INODES ||
+ i->iter->btree_id == BTREE_ID_REFLINK);
}
static inline bool update_has_triggers(struct btree_trans *trans,
bch2_trans_unlink_iters(trans, ~trans->iters_touched|
trans->iters_unlink_on_commit);
trans->iters_touched = 0;
- } else {
- bch2_trans_unlink_iters(trans, trans->iters_unlink_on_commit);
}
trans->nr_updates = 0;
trans->mem_top = 0;
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
- bch2_extent_trim_atomic(&delete, iter);
+
+ ret = bch2_extent_trim_atomic(&delete, iter);
+ if (ret)
+ break;
}
bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete));
*/
should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
if (WARN_ONCE(should_not_have_added > 0,
- "disk usage increased without a reservation")) {
+ "disk usage increased by %lli without a reservation",
+ should_not_have_added)) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
ret = -1;
}
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
- s64 delta)
+ unsigned offset, s64 delta,
+ unsigned flags)
{
- if (delta > 0) {
- /*
- * marking a new extent, which _will have size_ @delta
- *
- * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
- * case, we haven't actually created the key we'll be inserting
- * yet (for the split) - so we don't want to be using
- * k->size/crc.live_size here:
- */
- return __ptr_disk_sectors(p, delta);
+ if (flags & BCH_BUCKET_MARK_OVERWRITE_SPLIT) {
+ BUG_ON(offset + -delta > p.crc.live_size);
+
+ return -((s64) ptr_disk_sectors(p)) +
+ __ptr_disk_sectors(p, offset) +
+ __ptr_disk_sectors(p, p.crc.live_size -
+ offset + delta);
+ } else if (flags & BCH_BUCKET_MARK_OVERWRITE) {
+ BUG_ON(offset + -delta > p.crc.live_size);
+
+ return -((s64) ptr_disk_sectors(p)) +
+ __ptr_disk_sectors(p, p.crc.live_size +
+ delta);
} else {
- BUG_ON(-delta > p.crc.live_size);
-
- return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
- (s64) ptr_disk_sectors(p);
+ return ptr_disk_sectors(p);
}
}
spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
- return -1;
+ return -EIO;
}
BUG_ON(m->r.e.data_type != data_type);
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors, enum bch_data_type data_type,
+ unsigned offset, s64 sectors,
+ enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
unsigned journal_seq, unsigned flags)
{
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
- : ptr_disk_sectors_delta(p, sectors);
+ : ptr_disk_sectors_delta(p, offset, sectors, flags);
bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
fs_usage, journal_seq, flags);
}
int bch2_mark_key_locked(struct bch_fs *c,
- struct bkey_s_c k, s64 sectors,
+ struct bkey_s_c k,
+ unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
? c->opts.btree_node_size
: -c->opts.btree_node_size;
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_BTREE,
+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE,
fs_usage, journal_seq, flags);
break;
case KEY_TYPE_extent:
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+ case KEY_TYPE_reflink_v:
+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
fs_usage, journal_seq, flags);
break;
case KEY_TYPE_stripe:
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
- s64 sectors,
+ unsigned offset, s64 sectors,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
int ret;
percpu_down_read(&c->mark_lock);
- ret = bch2_mark_key_locked(c, k, sectors,
+ ret = bch2_mark_key_locked(c, k, offset, sectors,
fs_usage, journal_seq, flags);
percpu_up_read(&c->mark_lock);
{
struct bch_fs *c = trans->c;
struct btree *b = iter->l[0].b;
+ unsigned offset = 0;
s64 sectors = 0;
+ flags |= BCH_BUCKET_MARK_OVERWRITE;
+
if (btree_node_is_extents(b)
? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0
: bkey_cmp(new->k.p, old.k->p))
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
+ offset = 0;
sectors = -((s64) old.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
sectors = bkey_start_offset(&new->k) -
old.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
+ offset = 0;
sectors = bkey_start_offset(old.k) -
new->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- sectors = old.k->p.offset - new->k.p.offset;
- BUG_ON(sectors <= 0);
-
- bch2_mark_key_locked(c, old, sectors,
- fs_usage, trans->journal_res.seq,
- BCH_BUCKET_MARK_INSERT|flags);
-
- sectors = bkey_start_offset(&new->k) -
- old.k->p.offset;
+ offset = bkey_start_offset(&new->k) -
+ bkey_start_offset(old.k);
+ sectors = -((s64) new->k.size);
+ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
}
- return bch2_mark_key_locked(c, old, sectors, fs_usage,
- trans->journal_res.seq,
- BCH_BUCKET_MARK_OVERWRITE|flags) ?: 1;
+ return bch2_mark_key_locked(c, old, offset, sectors, fs_usage,
+ trans->journal_res.seq, flags) ?: 1;
}
int bch2_mark_update(struct btree_trans *trans,
if (!(trans->flags & BTREE_INSERT_NOMARK_INSERT))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k),
- bpos_min(insert->k->k.p, b->key.k.p).offset -
- bkey_start_offset(&insert->k->k),
+ 0, insert->k->k.size,
fs_usage, trans->journal_res.seq,
BCH_BUCKET_MARK_INSERT|flags);
xchg(&warned_disk_usage, 1))
return;
- pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+ bch_err(c, "disk usage increased more than %llu sectors reserved",
+ disk_res_sectors);
trans_for_each_update_iter(trans, i) {
struct btree_iter *iter = i->iter;
node_iter = iter->l[0].iter;
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
- KEY_TYPE_discard))) {
+ KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
struct btree_iter **iter,
struct bkey_s_c *k)
{
- unsigned i;
+ struct btree_insert_entry *i;
int ret;
- for (i = 0; i < trans->nr_updates; i++)
- if (!trans->updates[i].deferred &&
- trans->updates[i].iter->btree_id == btree_id &&
- !bkey_cmp(pos, trans->updates[i].iter->pos)) {
- *iter = trans->updates[i].iter;
- *k = bkey_i_to_s_c(trans->updates[i].k);
+ for (i = trans->updates;
+ i < trans->updates + trans->nr_updates;
+ i++)
+ if (!i->deferred &&
+ i->iter->btree_id == btree_id &&
+ (btree_node_type_is_extents(btree_id)
+ ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
+ bkey_cmp(pos, i->k->k.p) < 0
+ : !bkey_cmp(pos, i->iter->pos))) {
+ *iter = i->iter;
+ *k = bkey_i_to_s_c(i->k);
return 0;
}
if (IS_ERR(*iter))
return PTR_ERR(*iter);
+ bch2_trans_iter_free_on_commit(trans, *iter);
+
*k = bch2_btree_iter_peek_slot(*iter);
ret = bkey_err(*k);
if (ret)
struct bch_extent_stripe_ptr p,
s64 sectors, enum bch_data_type data_type)
{
+ struct bch_fs *c = trans->c;
struct bch_replicas_padded r;
struct btree_iter *iter;
struct bkey_i *new_k;
return ret;
if (k.k->type != KEY_TYPE_stripe) {
- bch_err_ratelimited(trans->c,
- "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- ret = -1;
+ bch2_fs_inconsistent(c,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.idx);
+ ret = -EIO;
goto out;
}
}
static int bch2_trans_mark_extent(struct btree_trans *trans,
- struct bkey_s_c k,
- s64 sectors, enum bch_data_type data_type)
+ struct bkey_s_c k, unsigned offset,
+ s64 sectors, unsigned flags,
+ enum bch_data_type data_type)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
- : ptr_disk_sectors_delta(p, sectors);
+ : ptr_disk_sectors_delta(p, offset, sectors, flags);
ret = bch2_trans_mark_pointer(trans, p, disk_sectors,
data_type);
return 0;
}
-int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 idx, unsigned sectors,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter;
+ struct bkey_i *new_k;
+ struct bkey_s_c k;
+ struct bkey_i_reflink_v *r_v;
+ s64 ret;
+
+ ret = trans_get_key(trans, BTREE_ID_REFLINK,
+ POS(0, idx), &iter, &k);
+ if (ret)
+ return ret;
+
+ if (k.k->type != KEY_TYPE_reflink_v) {
+ bch2_fs_inconsistent(c,
+ "%llu:%llu len %u points to nonexistent indirect extent %llu",
+ p.k->p.inode, p.k->p.offset, p.k->size, idx);
+ ret = -EIO;
+ goto err;
+ }
+
+ if ((flags & BCH_BUCKET_MARK_OVERWRITE) &&
+ (bkey_start_offset(k.k) < idx ||
+ k.k->p.offset > idx + sectors))
+ goto out;
+
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+
+ new_k = trans_update_key(trans, iter, k.k->u64s);
+ ret = PTR_ERR_OR_ZERO(new_k);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(new_k, k);
+ r_v = bkey_i_to_reflink_v(new_k);
+
+ le64_add_cpu(&r_v->v.refcount,
+ !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1);
+
+ if (!r_v->v.refcount) {
+ r_v->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&r_v->k, 0);
+ }
+out:
+ ret = k.k->p.offset - idx;
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p, unsigned offset,
s64 sectors, unsigned flags)
+{
+ u64 idx = le64_to_cpu(p.v->idx) + offset;
+ s64 ret = 0;
+
+ sectors = abs(sectors);
+ BUG_ON(offset + sectors > p.k->size);
+
+ while (sectors) {
+ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
+ if (ret < 0)
+ break;
+
+ idx += ret;
+ sectors = max_t(s64, 0LL, sectors - ret);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
+ unsigned offset, s64 sectors, unsigned flags)
{
struct replicas_delta_list *d;
struct bch_fs *c = trans->c;
? c->opts.btree_node_size
: -c->opts.btree_node_size;
- return bch2_trans_mark_extent(trans, k, sectors,
- BCH_DATA_BTREE);
+ return bch2_trans_mark_extent(trans, k, offset, sectors,
+ flags, BCH_DATA_BTREE);
case KEY_TYPE_extent:
- return bch2_trans_mark_extent(trans, k, sectors,
- BCH_DATA_USER);
+ case KEY_TYPE_reflink_v:
+ return bch2_trans_mark_extent(trans, k, offset, sectors,
+ flags, BCH_DATA_USER);
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);
d->fs_usage.persistent_reserved[replicas - 1] += sectors;
return 0;
}
+ case KEY_TYPE_reflink_p:
+ return bch2_trans_mark_reflink_p(trans,
+ bkey_s_c_to_reflink_p(k),
+ offset, sectors, flags);
default:
return 0;
}
if (!btree_node_type_needs_gc(iter->btree_id))
return 0;
- ret = bch2_trans_mark_key(trans,
- bkey_i_to_s_c(insert),
- bpos_min(insert->k.p, b->key.k.p).offset -
- bkey_start_offset(&insert->k),
- BCH_BUCKET_MARK_INSERT);
+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert),
+ 0, insert->k.size, BCH_BUCKET_MARK_INSERT);
if (ret)
return ret;
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k;
+ unsigned offset = 0;
s64 sectors = 0;
+ unsigned flags = BCH_BUCKET_MARK_OVERWRITE;
k = bkey_disassemble(b, _k, &unpacked);
if (btree_node_is_extents(b)) {
switch (bch2_extent_overlap(&insert->k, k.k)) {
case BCH_EXTENT_OVERLAP_ALL:
+ offset = 0;
sectors = -((s64) k.k->size);
break;
case BCH_EXTENT_OVERLAP_BACK:
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
sectors = bkey_start_offset(&insert->k) -
k.k->p.offset;
break;
case BCH_EXTENT_OVERLAP_FRONT:
+ offset = 0;
sectors = bkey_start_offset(k.k) -
insert->k.p.offset;
break;
case BCH_EXTENT_OVERLAP_MIDDLE:
- sectors = k.k->p.offset - insert->k.p.offset;
- BUG_ON(sectors <= 0);
-
- ret = bch2_trans_mark_key(trans, k, sectors,
- BCH_BUCKET_MARK_INSERT);
- if (ret)
- return ret;
-
- sectors = bkey_start_offset(&insert->k) -
- k.k->p.offset;
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
+ sectors = -((s64) insert->k.size);
+ flags |= BCH_BUCKET_MARK_OVERWRITE_SPLIT;
break;
}
BUG_ON(sectors >= 0);
}
- ret = bch2_trans_mark_key(trans, k, sectors,
- BCH_BUCKET_MARK_OVERWRITE);
+ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags);
if (ret)
return ret;
#define BCH_BUCKET_MARK_INSERT (1 << 0)
#define BCH_BUCKET_MARK_OVERWRITE (1 << 1)
-#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 2)
-#define BCH_BUCKET_MARK_GC (1 << 3)
-#define BCH_BUCKET_MARK_ALLOC_READ (1 << 4)
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 5)
+#define BCH_BUCKET_MARK_OVERWRITE_SPLIT (1 << 2)
+#define BCH_BUCKET_MARK_BUCKET_INVALIDATE (1 << 3)
+#define BCH_BUCKET_MARK_GC (1 << 4)
+#define BCH_BUCKET_MARK_ALLOC_READ (1 << 5)
+#define BCH_BUCKET_MARK_NOATOMIC (1 << 6)
-int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64,
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64,
struct bch_fs_usage *, u64, unsigned);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, unsigned);
void bch2_replicas_delta_list_apply(struct bch_fs *,
struct bch_fs_usage *,
struct replicas_delta_list *);
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, s64, unsigned);
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
+ unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *,
struct btree_iter *iter,
struct bkey_i *insert);
struct bch_stripe *v,
struct bkey_s_c k)
{
- struct bkey_s_c_extent e;
- const struct bch_extent_ptr *ptr;
- int idx;
- if (!bkey_extent_is_data(k.k))
- return -1;
-
- e = bkey_s_c_to_extent(k);
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const struct bch_extent_ptr *ptr;
+ int idx;
- extent_for_each_ptr(e, ptr) {
- idx = ptr_matches_stripe(c, v, ptr);
- if (idx >= 0)
- return idx;
+ extent_for_each_ptr(e, ptr) {
+ idx = ptr_matches_stripe(c, v, ptr);
+ if (idx >= 0)
+ return idx;
+ }
+ break;
+ }
}
return -1;
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
- struct bkey_s_c_extent e;
- const union bch_extent_entry *entry;
+ switch (k.k->type) {
+ case KEY_TYPE_extent: {
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ const union bch_extent_entry *entry;
- if (!bkey_extent_is_data(k.k))
- return false;
+ extent_for_each_entry(e, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
+ break;
+ }
+ }
return false;
}
break;
}
- bch2_mark_key(c, k, 0, NULL, 0,
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BCH_BUCKET_MARK_ALLOC_READ|
BCH_BUCKET_MARK_NOATOMIC);
}
bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
}
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->dev == dev)
+ return ptr;
+
+ return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const struct bch_extent_ptr *ptr;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
+ (!ptr->cached ||
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+ return true;
+
+ return false;
+}
+
/* extent specific utility code */
const struct bch_extent_ptr *
return NULL;
}
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
-{
- const struct bch_extent_ptr *ptr;
-
- extent_for_each_ptr(e, ptr)
- if (bch2_dev_in_target(c, ptr->dev, target) &&
- (!ptr->cached ||
- !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
- return ptr;
-
- return NULL;
-}
-
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
unsigned ret = 0;
return ret;
}
-bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
- struct bch_extent_ptr m, u64 offset)
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_extent_ptr m, u64 offset)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev == m.dev &&
p.ptr.gen == m.gen &&
- (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) ==
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
(s64) m.offset - offset)
return true;
bch2_csum_type_is_encryption(n.csum_type);
}
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
if (!n.csum_type)
return false;
- extent_for_each_crc(e, crc, i)
+ bkey_for_each_crc(k.k, ptrs, crc, i)
if (can_narrow_crc(crc, n))
return true;
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*/
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked n)
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
struct bch_extent_crc_unpacked u;
struct extent_ptr_decoded p;
union bch_extent_entry *i;
/* Find a checksum entry that covers only live data: */
if (!n.csum_type) {
- extent_for_each_crc(extent_i_to_s(e), u, i)
+ bkey_for_each_crc(&k->k, ptrs, u, i)
if (!u.compression_type &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
found:
BUG_ON(n.compression_type);
BUG_ON(n.offset);
- BUG_ON(n.live_size != e->k.size);
+ BUG_ON(n.live_size != k->k.size);
restart_narrow_pointers:
- extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
- bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr);
+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
- bch2_extent_ptr_decoded_append(e, &p);
+ bch2_extent_ptr_decoded_append(k, &p);
ret = true;
goto restart_narrow_pointers;
}
/* Extents */
-bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+void __bch2_cut_front(struct bpos where, struct bkey_s k)
{
- u64 len = 0;
+ u64 sub;
if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
- return false;
+ return;
EBUG_ON(bkey_cmp(where, k.k->p) > 0);
- len = k.k->p.offset - where.offset;
+ sub = where.offset - bkey_start_offset(k.k);
- BUG_ON(len > k.k->size);
+ k.k->size -= sub;
- /*
- * Don't readjust offset if the key size is now 0, because that could
- * cause offset to point to the next bucket:
- */
- if (!len)
+ if (!k.k->size)
k.k->type = KEY_TYPE_deleted;
- else if (bkey_extent_is_data(k.k)) {
- struct bkey_s_extent e = bkey_s_to_extent(k);
+
+ switch (k.k->type) {
+ case KEY_TYPE_deleted:
+ case KEY_TYPE_discard:
+ case KEY_TYPE_error:
+ case KEY_TYPE_cookie:
+ break;
+ case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v: {
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
bool seen_crc = false;
- extent_for_each_entry(e, entry) {
+ bkey_extent_entry_for_each(ptrs, entry) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
if (!seen_crc)
- entry->ptr.offset += e.k->size - len;
+ entry->ptr.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc32:
- entry->crc32.offset += e.k->size - len;
+ entry->crc32.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc64:
- entry->crc64.offset += e.k->size - len;
+ entry->crc64.offset += sub;
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.offset += e.k->size - len;
+ entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
break;
if (extent_entry_is_crc(entry))
seen_crc = true;
}
- }
- k.k->size = len;
+ break;
+ }
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
- return true;
+ le64_add_cpu(&p.v->idx, sub);
+ break;
+ }
+ case KEY_TYPE_reservation:
+ break;
+ default:
+ BUG();
+ }
}
bool bch2_cut_back(struct bpos where, struct bkey *k)
len = where.offset - bkey_start_offset(k);
- BUG_ON(len > k->size);
-
k->p = where;
k->size = len;
bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
return;
+ /*
+ * may have skipped past some deleted extents greater than the insert
+ * key, before we got to a non-deleted extent and knew we could bail out;
+ * rewind the iterator a bit if necessary:
+ */
+ node_iter = l->iter;
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+ bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0)
+ l->iter = node_iter;
+
k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
bch2_bset_insert(l->b, &l->iter, k, insert, 0);
return ret;
}
-static inline struct bpos
-bch2_extent_atomic_end(struct bkey_i *insert, struct btree_iter *iter)
+static int __bch2_extent_atomic_end(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned offset,
+ struct bpos *end,
+ unsigned *nr_iters,
+ unsigned max_iters)
+{
+ int ret = 0;
+
+ switch (k.k->type) {
+ case KEY_TYPE_extent:
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ return 0;
+ }
+
+ break;
+ case KEY_TYPE_reflink_p: {
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ u64 idx = le64_to_cpu(p.v->idx);
+ unsigned sectors = end->offset - bkey_start_offset(p.k);
+ struct btree_iter *iter;
+ struct bkey_s_c r_k;
+
+ for_each_btree_key(trans, iter,
+ BTREE_ID_REFLINK, POS(0, idx + offset),
+ BTREE_ITER_SLOTS, r_k, ret) {
+ if (bkey_cmp(bkey_start_pos(r_k.k),
+ POS(0, idx + sectors)) >= 0)
+ break;
+
+ *nr_iters += 1;
+ if (*nr_iters >= max_iters) {
+ struct bpos pos = bkey_start_pos(k.k);
+ pos.offset += r_k.k->p.offset - idx;
+
+ *end = bpos_min(*end, pos);
+ break;
+ }
+ }
+
+ bch2_trans_iter_put(trans, iter);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+int bch2_extent_atomic_end(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_i *insert,
+ struct bpos *end)
{
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
struct bkey_packed *_k;
- unsigned nr_alloc_ptrs =
+ unsigned nr_iters =
bch2_bkey_nr_alloc_ptrs(bkey_i_to_s_c(insert));
+ int ret = 0;
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
- while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ *end = bpos_min(insert->k.p, b->key.k.p);
+
+ ret = __bch2_extent_atomic_end(trans, bkey_i_to_s_c(insert),
+ 0, end, &nr_iters, 10);
+ if (ret)
+ return ret;
+
+ while (nr_iters < 20 &&
+ (_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+ unsigned offset = 0;
- if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
break;
- nr_alloc_ptrs += bch2_bkey_nr_alloc_ptrs(k);
+ if (bkey_cmp(bkey_start_pos(&insert->k),
+ bkey_start_pos(k.k)) > 0)
+ offset = bkey_start_offset(&insert->k) -
+ bkey_start_offset(k.k);
- if (nr_alloc_ptrs > 20) {
- BUG_ON(bkey_cmp(k.k->p, bkey_start_pos(&insert->k)) <= 0);
- return bpos_min(insert->k.p, k.k->p);
- }
+ ret = __bch2_extent_atomic_end(trans, k, offset,
+ end, &nr_iters, 20);
+ if (ret)
+ return ret;
+
+ if (nr_iters >= 20)
+ break;
bch2_btree_node_iter_advance(&node_iter, b);
}
- return bpos_min(insert->k.p, b->key.k.p);
+ return 0;
}
-void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
{
- bch2_cut_back(bch2_extent_atomic_end(k, iter), &k->k);
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter->trans, iter, k, &end);
+ if (ret)
+ return ret;
+
+ bch2_cut_back(end, &k->k);
+ return 0;
}
-bool bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
{
- return !bkey_cmp(bch2_extent_atomic_end(k, iter), k->k.p);
+ struct bpos end;
+ int ret;
+
+ ret = bch2_extent_atomic_end(iter->trans, iter, k, &end);
+ if (ret)
+ return ret;
+
+ return !bkey_cmp(end, k->k.p);
}
enum btree_insert_ret
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
}
-
- /*
- * may have skipped past some deleted extents greater than the insert
- * key, before we got to a non deleted extent and knew we could bail out
- * rewind the iterator a bit if necessary:
- */
- {
- struct btree_node_iter node_iter = l->iter;
-
- while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
- bkey_cmp_left_packed(l->b, _k, &insert->k.p) > 0)
- l->iter = node_iter;
- }
}
/**
#undef set_common_fields
}
-static void bch2_extent_crc_init(union bch_extent_crc *crc,
- struct bch_extent_crc_unpacked new)
+static void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ union bch_extent_crc *crc = (void *) ptrs.end;
+
if (bch_crc_bytes[new.csum_type] <= 4 &&
new.uncompressed_size - 1 <= CRC32_SIZE_MAX &&
new.nonce <= CRC32_NONCE_MAX)
BUG();
bch2_extent_crc_pack(crc, new);
-}
-void bch2_extent_crc_append(struct bkey_i_extent *e,
- struct bch_extent_crc_unpacked new)
-{
- bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
- __extent_entry_push(e);
+ k->k.u64s += extent_entry_u64s(ptrs.end);
+
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
}
-static inline void __extent_entry_insert(struct bkey_i_extent *e,
+static inline void __extent_entry_insert(struct bkey_i *k,
union bch_extent_entry *dst,
union bch_extent_entry *new)
{
- union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e));
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
dst, (u64 *) end - (u64 *) dst);
- e->k.u64s += extent_entry_u64s(new);
+ k->k.u64s += extent_entry_u64s(new);
memcpy(dst, new, extent_entry_bytes(new));
}
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
struct extent_ptr_decoded *p)
{
- struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL);
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+ struct bch_extent_crc_unpacked crc =
+ bch2_extent_crc_unpack(&k->k, NULL);
union bch_extent_entry *pos;
unsigned i;
if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
- pos = e->v.start;
+ pos = ptrs.start;
goto found;
}
- extent_for_each_crc(extent_i_to_s(e), crc, pos)
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
pos = extent_entry_next(pos);
goto found;
}
- bch2_extent_crc_append(e, p->crc);
- pos = extent_entry_last(extent_i_to_s(e));
+ bch2_extent_crc_append(k, p->crc);
+ pos = bkey_val_end(bkey_i_to_s(k));
found:
p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
- __extent_entry_insert(e, pos, to_entry(&p->ptr));
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
for (i = 0; i < p->ec_nr; i++) {
p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
- __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+ __extent_entry_insert(k, pos, to_entry(&p->ec[i]));
}
}
return false;
}
-void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e,
- unsigned target,
- unsigned nr_desired_replicas)
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+ unsigned target,
+ unsigned nr_desired_replicas)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
if (target && extra > 0)
- extent_for_each_ptr_decode(e, p, entry) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra &&
}
if (extra > 0)
- extent_for_each_ptr_decode(e, p, entry) {
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int n = bch2_extent_ptr_durability(c, p);
if (n && n <= extra) {
/* extent entries: */
-#define extent_entry_last(_e) bkey_val_end(_e)
+#define extent_entry_last(_e) \
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
#define entry_to_ptr(_entry) \
({ \
__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
_ptr, _entry)
+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
+({ \
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
+ if (extent_entry_is_crc(_iter)) { \
+ (_crc) = bch2_extent_crc_unpack(_k, \
+ entry_to_crc(_iter)); \
+ break; \
+ } \
+ \
+ (_iter) < (_end); \
+})
+
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
+ (_iter) = (_start); \
+ bkey_crc_next(_k, _start, _end, _crc, _iter); \
+ (_iter) = extent_entry_next(_iter))
+
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
+
/* utility code common to all keys with pointers: */
static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
return (struct bkey_ptrs_c) {
to_entry(&e.v->start[0]),
- to_entry(bkey_val_end(e))
+ to_entry(extent_entry_last(e))
};
}
case KEY_TYPE_extent: {
to_entry(&s.v->ptrs[s.v->nr_blocks]),
};
}
+ case KEY_TYPE_reflink_v: {
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ return (struct bkey_ptrs_c) {
+ r.v->start,
+ bkey_val_end(r),
+ };
+ }
default:
return (struct bkey_ptrs_c) { NULL, NULL };
}
return ret;
}
-static inline bool bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
-{
- struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
-
- bkey_for_each_ptr(p, ptr)
- if (ptr->dev == dev)
- return ptr;
-
- return NULL;
-}
-
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
struct bch_io_failures *,
struct extent_ptr_decoded *);
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
.key_merge = bch2_reservation_merge, \
}
-void bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-bool bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
enum btree_insert_ret
bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
void bch2_insert_fixup_extent(struct btree_trans *,
struct btree_insert_entry *);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
- unsigned, unsigned);
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+ unsigned, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
-bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
- struct bch_extent_ptr, u64);
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+ struct bch_extent_ptr, u64);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
switch (k->type) {
case KEY_TYPE_btree_ptr:
case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
return true;
default:
return false;
}
}
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
switch (k->type) {
case KEY_TYPE_extent:
case KEY_TYPE_reservation:
+ case KEY_TYPE_reflink_p:
+ case KEY_TYPE_reflink_v:
return true;
default:
return false;
}
}
-static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
-{
- return bkey_extent_is_allocation(k.k) &&
- !bch2_extent_is_compressed(k);
-}
-
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-
/* Extent entry iteration: */
#define extent_for_each_entry_from(_e, _entry, _start) \
#define extent_for_each_ptr(_e, _ptr) \
__bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
-#define extent_crc_next(_e, _crc, _iter) \
-({ \
- extent_for_each_entry_from(_e, _iter, _iter) \
- if (extent_entry_is_crc(_iter)) { \
- (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
- break; \
- } \
- \
- (_iter) < extent_entry_last(_e); \
-})
-
-#define extent_for_each_crc(_e, _crc, _iter) \
- for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
- (_iter) = (_e).v->start; \
- extent_crc_next(_e, _crc, _iter); \
- (_iter) = extent_entry_next(_iter))
-
#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
__bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
extent_entry_last(_e), _ptr, _entry)
-void bch2_extent_crc_append(struct bkey_i_extent *,
- struct bch_extent_crc_unpacked);
-void bch2_extent_ptr_decoded_append(struct bkey_i_extent *,
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
-static inline void __extent_entry_push(struct bkey_i_extent *e)
-{
- union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
-
- EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
- BKEY_EXTENT_VAL_U64s_MAX);
-
- e->k.u64s += extent_entry_u64s(entry);
-}
-
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
struct bch_extent_crc_unpacked);
-bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
struct bch_extent_ptr *);
} \
} while (0)
-bool __bch2_cut_front(struct bpos, struct bkey_s);
+void __bch2_cut_front(struct bpos, struct bkey_s);
-static inline bool bch2_cut_front(struct bpos where, struct bkey_i *k)
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
{
- return __bch2_cut_front(where, bkey_i_to_s(k));
+ __bch2_cut_front(where, bkey_i_to_s(k));
}
bool bch2_cut_back(struct bpos, struct bkey *);
#include "io.h"
#include "keylist.h"
#include "quota.h"
+#include "reflink.h"
#include <linux/aio.h>
#include <linux/backing-dev.h>
return 0;
}
-static int __must_check bch2_write_inode_size(struct bch_fs *c,
- struct bch_inode_info *inode,
- loff_t new_size, unsigned fields)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ loff_t new_size, unsigned fields)
{
struct inode_new_size s = {
.new_size = new_size,
return 0;
}
-static int bch2_extent_update(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct disk_reservation *disk_res,
- struct quota_res *quota_res,
- struct btree_iter *extent_iter,
- struct bkey_i *k,
- u64 new_i_size,
- bool may_allocate,
- bool direct,
- s64 *total_delta)
+int bch2_extent_update(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct disk_reservation *disk_res,
+ struct quota_res *quota_res,
+ struct btree_iter *extent_iter,
+ struct bkey_i *k,
+ u64 new_i_size,
+ bool may_allocate,
+ bool direct,
+ s64 *total_delta)
{
struct bch_fs *c = trans->c;
struct btree_iter *inode_iter = NULL;
s64 i_sectors_delta;
int ret;
- bch2_trans_begin_updates(trans);
-
ret = bch2_btree_iter_traverse(extent_iter);
if (ret)
return ret;
- bch2_extent_trim_atomic(k, extent_iter);
+ ret = bch2_extent_trim_atomic(k, extent_iter);
+ if (ret)
+ return ret;
ret = sum_sector_overwrites(trans, extent_iter,
k, &allocating,
bkey_copy(&tmp.k, bch2_keylist_front(keys));
+ bch2_trans_begin_updates(&trans);
+
ret = bch2_extent_update(&trans, inode,
&wop->res, quota_res,
iter, &tmp.k,
/* i_sectors: */
enum {
SECTOR_UNALLOCATED,
- SECTOR_QUOTA_RESERVED,
+ SECTOR_RESERVED,
SECTOR_DIRTY,
SECTOR_ALLOCATED,
} state:2;
};
struct bch_page_state {
+ atomic_t write_count;
struct bch_page_sector s[PAGE_SECTORS];
};
return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
}
-static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page)
-{
- struct bch_page_state *s = bch2_page_state(page);
- struct disk_reservation disk_res = { 0 };
- struct quota_res quota_res = { 0 };
- unsigned i;
-
- if (!s)
- return;
-
- for (i = 0; i < ARRAY_SIZE(s->s); i++) {
- disk_res.sectors += s->s[i].replicas_reserved;
- s->s[i].replicas_reserved = 0;
-
- if (s->s[i].state == SECTOR_QUOTA_RESERVED) {
- quota_res.sectors++;
- s->s[i].state = SECTOR_UNALLOCATED;
- }
- }
-
- bch2_quota_reservation_put(c, inode, &quota_res);
- bch2_disk_reservation_put(c, &disk_res);
-}
-
static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
{
/* XXX: this should not be open coded */
return 0;
}
-static int bch2_get_page_quota_reservation(struct bch_fs *c,
+struct bch2_page_reservation {
+ struct disk_reservation disk;
+ struct quota_res quota;
+};
+
+static void bch2_page_reservation_init(struct bch_fs *c,
struct bch_inode_info *inode,
- struct page *page, bool check_enospc)
+ struct bch2_page_reservation *res)
+{
+ memset(res, 0, sizeof(*res));
+
+ res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+static void bch2_page_reservation_put(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch2_page_reservation *res)
+{
+ bch2_disk_reservation_put(c, &res->disk);
+ bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+static int bch2_page_reservation_get(struct bch_fs *c,
+ struct bch_inode_info *inode, struct page *page,
+ struct bch2_page_reservation *res,
+ unsigned offset, unsigned len, bool check_enospc)
{
struct bch_page_state *s = bch2_page_state_create(page, 0);
- struct quota_res quota_res = { 0 };
- unsigned i, quota_res_sectors = 0;
+ unsigned i, disk_sectors = 0, quota_sectors = 0;
int ret;
if (!s)
return -ENOMEM;
- for (i = 0; i < ARRAY_SIZE(s->s); i++)
- quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED;
-
- if (!quota_res_sectors)
- return 0;
+ for (i = offset / 512;
+ i < DIV_ROUND_UP(offset + len, 512);
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
+ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
+ }
- ret = bch2_quota_reservation_add(c, inode, &quota_res,
- quota_res_sectors,
- check_enospc);
- if (unlikely(ret))
- return ret;
+ if (disk_sectors) {
+ ret = bch2_disk_reservation_add(c, &res->disk,
+ disk_sectors,
+ !check_enospc
+ ? BCH_DISK_RESERVATION_NOFAIL
+ : 0);
+ if (unlikely(ret))
+ return ret;
+ }
- for (i = 0; i < ARRAY_SIZE(s->s); i++)
- if (s->s[i].state == SECTOR_UNALLOCATED)
- s->s[i].state = SECTOR_QUOTA_RESERVED;
+ if (quota_sectors) {
+ ret = bch2_quota_reservation_add(c, inode, &res->quota,
+ quota_sectors,
+ check_enospc);
+ if (unlikely(ret)) {
+ struct disk_reservation tmp = {
+ .sectors = disk_sectors
+ };
+
+ bch2_disk_reservation_put(c, &tmp);
+ res->disk.sectors -= disk_sectors;
+ return ret;
+ }
+ }
return 0;
}
-static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
- struct page *page, bool check_enospc)
-{
- return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?:
- bch2_get_page_quota_reservation(c, inode, page, check_enospc);
-}
-
static void bch2_clear_page_bits(struct page *page)
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state *s = bch2_page_state(page);
+ struct disk_reservation disk_res = { 0 };
int i, dirty_sectors = 0;
if (!s)
return;
for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+ disk_res.sectors += s->s[i].replicas_reserved;
+ s->s[i].replicas_reserved = 0;
+
if (s->s[i].state == SECTOR_DIRTY) {
dirty_sectors++;
s->s[i].state = SECTOR_UNALLOCATED;
}
}
+ bch2_disk_reservation_put(c, &disk_res);
+
if (dirty_sectors)
i_sectors_acct(c, inode, NULL, -dirty_sectors);
- bch2_put_page_reservation(c, inode, page);
bch2_page_state_release(page);
}
-static void __bch2_set_page_dirty(struct page *page)
+static void bch2_set_page_dirty(struct bch_fs *c,
+ struct bch_inode_info *inode, struct page *page,
+ struct bch2_page_reservation *res,
+ unsigned offset, unsigned len)
{
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state *s = bch2_page_state(page);
- struct quota_res quota_res = { 0 };
unsigned i, dirty_sectors = 0;
- BUG_ON(!s);
+ for (i = offset / 512;
+ i < DIV_ROUND_UP(offset + len, 512);
+ i++) {
+ unsigned sectors = sectors_to_reserve(&s->s[i],
+ res->disk.nr_replicas);
- for (i = 0; i < ARRAY_SIZE(s->s); i++) {
- if (s->s[i].state == SECTOR_QUOTA_RESERVED)
- quota_res.sectors++;
+ BUG_ON(sectors > res->disk.sectors);
+ s->s[i].replicas_reserved += sectors;
+ res->disk.sectors -= sectors;
- if (s->s[i].state == SECTOR_UNALLOCATED ||
- s->s[i].state == SECTOR_QUOTA_RESERVED) {
- s->s[i].state = SECTOR_DIRTY;
+ if (s->s[i].state == SECTOR_UNALLOCATED)
dirty_sectors++;
- }
+
+ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
}
if (dirty_sectors)
- i_sectors_acct(c, inode, &quota_res, dirty_sectors);
- bch2_quota_reservation_put(c, inode, &quota_res);
-}
+ i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-static void bch2_set_page_dirty(struct page *page)
-{
- __bch2_set_page_dirty(page);
- __set_page_dirty_nobuffers(page);
+ if (!PageDirty(page))
+ __set_page_dirty_nobuffers(page);
}
vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct bch_inode_info *inode = file_bch_inode(file);
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation res;
int ret = VM_FAULT_LOCKED;
+ bch2_page_reservation_init(c, inode, &res);
+
sb_start_pagefault(inode->v.i_sb);
file_update_time(file);
goto out;
}
- if (bch2_get_page_reservation(c, inode, page, true)) {
+ if (bch2_page_reservation_get(c, inode, page, &res,
+ 0, PAGE_SIZE, true)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
goto out;
}
- if (!PageDirty(page))
- bch2_set_page_dirty(page);
+ bch2_set_page_dirty(c, inode, page, &res, 0, PAGE_SIZE);
wait_for_stable_page(page);
out:
if (current->pagecache_lock != &mapping->add_lock)
pagecache_add_put(&mapping->add_lock);
sb_end_pagefault(inode->v.i_sb);
+
+ bch2_page_reservation_put(c, inode, &res);
+
return ret;
}
}
#endif
-/* readpages/writepages: */
-
-static bool bio_can_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
- return bio->bi_vcnt < bio->bi_max_vecs &&
- bio_end_sector(bio) == offset;
-}
-
-static int bio_add_page_contig(struct bio *bio, struct page *page)
-{
- sector_t offset = (sector_t) page->index << PAGE_SECTOR_SHIFT;
-
- EBUG_ON(!bio->bi_max_vecs);
-
- if (!bio->bi_vcnt)
- bio->bi_iter.bi_sector = offset;
- else if (!bio_can_add_page_contig(bio, page))
- return -1;
-
- BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
- return 0;
-}
-
/* readpage(s): */
static void bch2_readpages_end_io(struct bio *bio)
{
struct bvec_iter iter;
struct bio_vec bv;
- unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
-
- BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k));
- BUG_ON(bio_end_sector(bio) > k.k->p.offset);
-
+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+ ? 0 : bch2_bkey_nr_ptrs_allocated(k);
+ unsigned state = k.k->type == KEY_TYPE_reservation
+ ? SECTOR_RESERVED
+ : SECTOR_ALLOCATED;
bio_for_each_segment(bv, bio, iter) {
struct bch_page_state *s = bch2_page_state(bv.bv_page);
i < (bv.bv_offset + bv.bv_len) >> 9;
i++) {
s->s[i].nr_replicas = nr_ptrs;
- s->s[i].state = SECTOR_ALLOCATED;
+ s->s[i].state = state;
}
}
}
static void readpage_bio_extend(struct readpages_iter *iter,
- struct bio *bio, u64 offset,
+ struct bio *bio,
+ unsigned sectors_this_extent,
bool get_more)
{
- while (bio_end_sector(bio) < offset &&
+ while (bio_sectors(bio) < sectors_this_extent &&
bio->bi_vcnt < bio->bi_max_vecs) {
pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
struct page *page = readpage_iter_next(iter);
struct readpages_iter *readpages_iter)
{
struct bch_fs *c = trans->c;
- struct bio *bio = &rbio->bio;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
+ int ret = 0;
rbio->c = c;
rbio->start_time = local_clock();
-
+retry:
while (1) {
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
- bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
+ bch2_btree_iter_set_pos(iter,
+ POS(inum, rbio->bio.bi_iter.bi_sector));
k = bch2_btree_iter_peek_slot(iter);
- BUG_ON(!k.k);
-
- if (IS_ERR(k.k)) {
- int ret = btree_iter_err(iter);
- BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
- return;
- }
+ ret = bkey_err(k);
+ if (ret)
+ break;
bkey_reassemble(&tmp.k, k);
- bch2_trans_unlock(trans);
k = bkey_i_to_s_c(&tmp.k);
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(trans, iter,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bch2_trans_unlock(trans);
+
if (readpages_iter) {
bool want_full_extent = false;
if (bkey_extent_is_data(k.k)) {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *i;
struct extent_ptr_decoded p;
- extent_for_each_ptr_decode(e, p, i)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, i)
want_full_extent |= ((p.crc.csum_type != 0) |
(p.crc.compression_type != 0));
}
- readpage_bio_extend(readpages_iter,
- bio, k.k->p.offset,
- want_full_extent);
+ readpage_bio_extend(readpages_iter, &rbio->bio,
+ sectors, want_full_extent);
}
- bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
- bio->bi_iter.bi_sector) << 9;
- swap(bio->bi_iter.bi_size, bytes);
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+ swap(rbio->bio.bi_iter.bi_size, bytes);
- if (bytes == bio->bi_iter.bi_size)
+ if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
if (bkey_extent_is_allocation(k.k))
- bch2_add_page_sectors(bio, k);
+ bch2_add_page_sectors(&rbio->bio, k);
- bch2_read_extent(c, rbio, k, flags);
+ bch2_read_extent(c, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
return;
- swap(bio->bi_iter.bi_size, bytes);
- bio_advance(bio, bytes);
+ swap(rbio->bio.bi_iter.bi_size, bytes);
+ bio_advance(&rbio->bio, bytes);
}
+
+ if (ret == -EINTR)
+ goto retry;
+
+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bio_endio(&rbio->bio);
}
int bch2_readpages(struct file *file, struct address_space *mapping,
bch2_page_state_create(page, __GFP_NOFAIL);
bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
- bio_add_page_contig(&rbio->bio, page);
+ rbio->bio.bi_iter.bi_sector =
+ (sector_t) page->index << PAGE_SECTOR_SHIFT;
+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
bch2_trans_init(&trans, c, 0, 0);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN,
struct bio *bio = &io->op.op.wbio.bio;
struct bvec_iter_all iter;
struct bio_vec *bvec;
- unsigned i;
+ unsigned i, j;
if (io->op.op.error) {
bio_for_each_segment_all(bvec, bio, i, iter) {
+ struct bch_page_state *s;
+
SetPageError(bvec->bv_page);
mapping_set_error(bvec->bv_page->mapping, -EIO);
+
+ lock_page(bvec->bv_page);
+ s = bch2_page_state(bvec->bv_page);
+ for (j = 0; j < PAGE_SECTORS; j++)
+ s->s[j].nr_replicas = 0;
+ unlock_page(bvec->bv_page);
}
}
i_sectors_acct(c, io->op.inode, NULL,
io->op.sectors_added - (s64) io->new_sectors);
- bio_for_each_segment_all(bvec, bio, i, iter)
- end_page_writeback(bvec->bv_page);
+ bio_for_each_segment_all(bvec, bio, i, iter) {
+ struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
+
+ if (atomic_dec_and_test(&s->write_count))
+ end_page_writeback(bvec->bv_page);
+ }
closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
}
static void bch2_writepage_io_alloc(struct bch_fs *c,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
- struct page *page,
+ u64 sector,
unsigned nr_replicas)
{
struct bch_write_op *op;
- u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
w->io = container_of(bio_alloc_bioset(GFP_NOFS,
BIO_MAX_PAGES,
op->nr_replicas = nr_replicas;
op->res.nr_replicas = nr_replicas;
op->write_point = writepoint_hashed(inode->ei_last_dirtied);
- op->pos = POS(inode->v.i_ino, offset);
- op->wbio.bio.bi_iter.bi_sector = offset;
+ op->pos = POS(inode->v.i_ino, sector);
+ op->wbio.bio.bi_iter.bi_sector = sector;
}
static int __bch2_writepage(struct page *page,
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_writepage_state *w = data;
- struct bch_page_state *s;
- unsigned offset, nr_replicas_this_write = U32_MAX;
- unsigned dirty_sectors = 0, reserved_sectors = 0;
+ struct bch_page_state *s, orig;
+ unsigned i, offset, nr_replicas_this_write = U32_MAX;
loff_t i_size = i_size_read(&inode->v);
pgoff_t end_index = i_size >> PAGE_SHIFT;
- unsigned i;
int ret;
EBUG_ON(!PageUptodate(page));
return 0;
}
- for (i = 0; i < PAGE_SECTORS; i++)
+ /* Before unlocking the page, get a copy of the reservations: */
+ orig = *s;
+
+ for (i = 0; i < PAGE_SECTORS; i++) {
+ if (s->s[i].state < SECTOR_DIRTY)
+ continue;
+
nr_replicas_this_write =
min_t(unsigned, nr_replicas_this_write,
s->s[i].nr_replicas +
s->s[i].replicas_reserved);
-
- /* Before unlocking the page, transfer reservation to w->io: */
+ }
for (i = 0; i < PAGE_SECTORS; i++) {
+ if (s->s[i].state < SECTOR_DIRTY)
+ continue;
+
s->s[i].nr_replicas = w->opts.compression
? 0 : nr_replicas_this_write;
- reserved_sectors += s->s[i].replicas_reserved;
s->s[i].replicas_reserved = 0;
-
- dirty_sectors += s->s[i].state == SECTOR_DIRTY;
s->s[i].state = SECTOR_ALLOCATED;
}
+ BUG_ON(atomic_read(&s->write_count));
+ atomic_set(&s->write_count, 1);
+
BUG_ON(PageWriteback(page));
set_page_writeback(page);
+
unlock_page(page);
- if (w->io &&
- (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
- !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
- bch2_writepage_do_io(w);
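+ /*
+ * Write out each contiguous run of dirty sectors as its own bio segment,
+ * starting a new writepage io whenever the replica count, bio size or
+ * on-disk position no longer lines up with the current one:
+ */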
+ offset = 0;
+ while (1) {
+ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+ u64 sector;
+
+ while (offset < PAGE_SECTORS &&
+ orig.s[offset].state < SECTOR_DIRTY)
+ offset++;
+
+ if (offset == PAGE_SECTORS)
+ break;
+
+ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
+
+ while (offset + sectors < PAGE_SECTORS &&
+ orig.s[offset + sectors].state >= SECTOR_DIRTY)
+ sectors++;
+
+ for (i = offset; i < offset + sectors; i++) {
+ reserved_sectors += orig.s[i].replicas_reserved;
+ dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
+ }
- if (!w->io)
- bch2_writepage_io_alloc(c, w, inode, page,
- nr_replicas_this_write);
+ if (w->io &&
+ (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
+ bio_full(&w->io->op.op.wbio.bio) ||
+ bio_end_sector(&w->io->op.op.wbio.bio) != sector))
+ bch2_writepage_do_io(w);
- w->io->new_sectors += dirty_sectors;
+ if (!w->io)
+ bch2_writepage_io_alloc(c, w, inode, sector,
+ nr_replicas_this_write);
- BUG_ON(inode != w->io->op.inode);
- BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+ w->io->new_sectors += dirty_sectors;
- w->io->op.op.res.sectors += reserved_sectors;
- w->io->op.new_i_size = i_size;
+ atomic_inc(&s->write_count);
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+ BUG_ON(inode != w->io->op.inode);
+ BUG_ON(!bio_add_page(&w->io->op.op.wbio.bio, page,
+ sectors << 9, offset << 9));
+
+ w->io->op.op.res.sectors += reserved_sectors;
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
+ offset += sectors;
+ }
+
+ if (atomic_dec_and_test(&s->write_count))
+ end_page_writeback(page);
return 0;
}
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation *res;
pgoff_t index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
struct page *page;
int ret = -ENOMEM;
- BUG_ON(inode_unhashed(&inode->v));
+ res = kmalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return -ENOMEM;
+
+ bch2_page_reservation_init(c, inode, res);
+ *fsdata = res;
/* Not strictly necessary - same reason as mkwrite(): */
pagecache_add_get(&mapping->add_lock);
if (ret)
goto err;
out:
- ret = bch2_get_page_reservation(c, inode, page, true);
+ ret = bch2_page_reservation_get(c, inode, page, res,
+ offset, len, true);
if (ret) {
if (!PageUptodate(page)) {
/*
*pagep = NULL;
err_unlock:
pagecache_add_put(&mapping->add_lock);
+ kfree(res);
+ *fsdata = NULL;
return ret;
}
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch2_page_reservation *res = fsdata;
+ unsigned offset = pos & (PAGE_SIZE - 1);
lockdep_assert_held(&inode->v.i_rwsem);
if (copied) {
if (!PageUptodate(page))
SetPageUptodate(page);
- if (!PageDirty(page))
- bch2_set_page_dirty(page);
+
+ bch2_set_page_dirty(c, inode, page, res, offset, copied);
inode->ei_last_dirtied = (unsigned long) current;
- } else {
- bch2_put_page_reservation(c, inode, page);
}
unlock_page(page);
put_page(page);
pagecache_add_put(&mapping->add_lock);
+ bch2_page_reservation_put(c, inode, res);
+ kfree(res);
+
return copied;
}
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct page *pages[WRITE_BATCH_PAGES];
+ struct bch2_page_reservation res;
unsigned long index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- unsigned i, copied = 0, nr_pages_copied = 0;
+ unsigned i, reserved = 0, set_dirty = 0;
+ unsigned copied = 0, nr_pages_copied = 0;
int ret = 0;
BUG_ON(!len);
BUG_ON(nr_pages > ARRAY_SIZE(pages));
+ bch2_page_reservation_init(c, inode, &res);
+
for (i = 0; i < nr_pages; i++) {
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
if (!pages[i]) {
}
}
- for (i = 0; i < nr_pages; i++) {
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
-
- if (ret && !PageUptodate(pages[i])) {
- ret = bch2_read_single_page(pages[i], mapping);
- if (ret)
- goto out;
-
- ret = bch2_get_page_reservation(c, inode, pages[i], true);
+ while (reserved < len) {
+ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
+ unsigned pg_len = min_t(unsigned, len - reserved,
+ PAGE_SIZE - pg_offset);
+retry_reservation:
+ ret = bch2_page_reservation_get(c, inode, page, &res,
+ pg_offset, pg_len, true);
+
+ if (ret && !PageUptodate(page)) {
+ ret = bch2_read_single_page(page, mapping);
+ if (!ret)
+ goto retry_reservation;
}
if (ret)
goto out;
+
+ reserved += pg_len;
}
if (mapping_writably_mapped(mapping))
while (copied < len) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
- unsigned pg_bytes = min_t(unsigned, len - copied,
- PAGE_SIZE - pg_offset);
+ unsigned pg_len = min_t(unsigned, len - copied,
+ PAGE_SIZE - pg_offset);
unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
- iter, pg_offset, pg_bytes);
+ iter, pg_offset, pg_len);
if (!pg_copied)
break;
copied -= (offset + copied) & (PAGE_SIZE - 1);
}
}
-out:
- for (i = 0; i < nr_pages_copied; i++) {
- if (!PageUptodate(pages[i]))
- SetPageUptodate(pages[i]);
- if (!PageDirty(pages[i]))
- bch2_set_page_dirty(pages[i]);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ while (set_dirty < copied) {
+ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
+ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
+ unsigned pg_len = min_t(unsigned, copied - set_dirty,
+ PAGE_SIZE - pg_offset);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
+ unlock_page(page);
+ put_page(page);
+
+ set_dirty += pg_len;
+ }
+out:
for (i = nr_pages_copied; i < nr_pages; i++) {
- if (!PageDirty(pages[i]))
- bch2_put_page_reservation(c, inode, pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
+ bch2_page_reservation_put(c, inode, &res);
+
return copied ?: ret;
}
/* truncate: */
-static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
- u64 start_offset, u64 end_offset, u64 *journal_seq)
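+/*
+ * Delete extents from @iter's current position to @end; -EINTR from the btree
+ * is stashed in ret2 and returned after the loop so the caller can decide
+ * whether to restart the operation.
+ */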
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, struct bch_inode_info *inode,
+ u64 new_i_size)
{
- struct bpos start = POS(inode->v.i_ino, start_offset);
- struct bpos end = POS(inode->v.i_ino, end_offset);
+ struct bch_fs *c = trans->c;
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
- struct btree_trans trans;
- struct btree_iter *iter;
struct bkey_s_c k;
- int ret = 0;
-
- bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
- BTREE_ITER_INTENT);
+ int ret = 0, ret2 = 0;
while ((k = bch2_btree_iter_peek(iter)).k &&
- !(ret = bkey_err(k)) &&
bkey_cmp(iter->pos, end) < 0) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i delete;
+ ret = bkey_err(k);
+ if (ret)
+ goto btree_err;
+
bkey_init(&delete.k);
delete.k.p = iter->pos;
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
- ret = bch2_extent_update(&trans, inode,
+ bch2_trans_begin_updates(trans);
+
+ ret = bch2_extent_update(trans, inode,
&disk_res, NULL, iter, &delete,
- 0, true, true, NULL);
+ new_i_size, false, true, NULL);
bch2_disk_reservation_put(c, &disk_res);
-
- if (ret == -EINTR)
+btree_err:
+ if (ret == -EINTR) {
+ ret2 = ret;
ret = 0;
+ }
if (ret)
break;
+ }
- bch2_trans_cond_resched(&trans);
+ if (bkey_cmp(iter->pos, end) > 0) {
+ bch2_btree_iter_set_pos(iter, end);
+ ret = bch2_btree_iter_traverse(iter);
}
+ return ret ?: ret2;
+}
+
+static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
+ u64 start_offset, u64 end_offset)
+{
+ struct btree_trans trans;
+ struct btree_iter *iter;
+ int ret = 0;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(inode->v.i_ino, start_offset),
+ BTREE_ITER_INTENT);
+
+ ret = bch2_fpunch_at(&trans, iter,
+ POS(inode->v.i_ino, end_offset),
+ inode, 0);
+
bch2_trans_exit(&trans);
+ if (ret == -EINTR)
+ ret = 0;
+
return ret;
}
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
+ struct bch_page_state *s;
unsigned start_offset = start & (PAGE_SIZE - 1);
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+ unsigned i;
struct page *page;
int ret = 0;
}
}
+ s = bch2_page_state_create(page, 0);
+ if (!s) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
if (!PageUptodate(page)) {
ret = bch2_read_single_page(page, mapping);
if (ret)
goto unlock;
}
+ if (index != start >> PAGE_SHIFT)
+ start_offset = 0;
+ if (index != end >> PAGE_SHIFT)
+ end_offset = PAGE_SIZE;
+
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
+ i < round_down(end_offset, block_bytes(c)) >> 9;
+ i++) {
+ s->s[i].nr_replicas = 0;
+ s->s[i].state = SECTOR_UNALLOCATED;
+ }
+
+ zero_user_segment(page, start_offset, end_offset);
+
/*
* Bit of a hack - we don't want truncate to fail due to -ENOSPC.
*
* XXX: because we aren't currently tracking whether the page has actual
* data in it (vs. just 0s, or only partially written) this wrong. ick.
*/
- ret = bch2_get_page_reservation(c, inode, page, false);
+ ret = bch2_get_page_disk_reservation(c, inode, page, false);
BUG_ON(ret);
- if (index == start >> PAGE_SHIFT &&
- index == end >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, end_offset);
- else if (index == start >> PAGE_SHIFT)
- zero_user_segment(page, start_offset, PAGE_SIZE);
- else if (index == end >> PAGE_SHIFT)
- zero_user_segment(page, 0, end_offset);
-
- if (!PageDirty(page))
- bch2_set_page_dirty(page);
+ __set_page_dirty_nobuffers(page);
unlock:
unlock_page(page);
put_page(page);
static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
{
return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
+ from, round_up(from, PAGE_SIZE));
}
static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
truncate_setsize(&inode->v, iattr->ia_size);
- /*
- * XXX: need a comment explaining why PAGE_SIZE and not block_bytes()
- * here:
- */
ret = __bch2_fpunch(c, inode,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- U64_MAX, &inode->ei_journal_seq);
+ round_up(iattr->ia_size, block_bytes(c)) >> 9,
+ U64_MAX);
if (unlikely(ret))
goto err;
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
- u64 discard_start = round_up(offset, PAGE_SIZE) >> 9;
- u64 discard_end = round_down(offset + len, PAGE_SIZE) >> 9;
+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
int ret = 0;
inode_lock(&inode->v);
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
if (discard_start < discard_end)
- ret = __bch2_fpunch(c, inode, discard_start, discard_end,
- &inode->ei_journal_seq);
+ ret = __bch2_fpunch(c, inode, discard_start, discard_end);
err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
while (bkey_cmp(dst->pos,
POS(inode->v.i_ino,
- round_up(new_size, PAGE_SIZE) >> 9)) < 0) {
+ round_up(new_size, block_bytes(c)) >> 9)) < 0) {
struct disk_reservation disk_res;
ret = bch2_btree_iter_traverse(dst);
bch2_cut_front(src->pos, &copy.k);
copy.k.k.p.offset -= len >> 9;
- bch2_extent_trim_atomic(&copy.k, dst);
+ ret = bch2_extent_trim_atomic(&copy.k, dst);
+ if (ret)
+ goto bkey_err;
BUG_ON(bkey_cmp(dst->pos, bkey_start_pos(&copy.k.k)));
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
+ bch2_trans_begin_updates(&trans);
+
ret = bch2_extent_update(&trans, inode,
&disk_res, NULL,
dst, &copy.k,
ret = __bch2_fpunch(c, inode,
round_up(new_size, block_bytes(c)) >> 9,
- U64_MAX, &inode->ei_journal_seq);
+ U64_MAX);
if (ret)
goto err;
struct btree_trans trans;
struct btree_iter *iter;
struct bpos end_pos;
- loff_t block_start, block_end;
- loff_t end = offset + len;
+ loff_t end = offset + len;
+ loff_t block_start = round_down(offset, block_bytes(c));
+ loff_t block_end = round_up(end, block_bytes(c));
unsigned sectors;
unsigned replicas = io_opts(c, inode).data_replicas;
int ret;
goto err;
truncate_pagecache_range(&inode->v, offset, end - 1);
-
- block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(end, PAGE_SIZE);
- } else {
- block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(end, PAGE_SIZE);
}
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
reservation.v.nr_replicas = disk_res.nr_replicas;
}
+ bch2_trans_begin_updates(&trans);
+
ret = bch2_extent_update(&trans, inode,
&disk_res, &quota_res,
iter, &reservation.k_i,
return -EOPNOTSUPP;
}
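+/*
+ * Walk the pagecache for [start, end) and reset each cached sector's
+ * nr_replicas to 0, so that later writes to the (about to be shared) source
+ * range go back through disk reservation instead of reusing the old
+ * allocation.
+ */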
+static void mark_range_unallocated(struct bch_inode_info *inode,
+ loff_t start, loff_t end)
+{
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
+ struct pagevec pvec;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i, j;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+ &index, end_index);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct bch_page_state *s;
+
+ lock_page(page);
+ s = bch2_page_state(page);
+
+ if (s)
+ for (j = 0; j < PAGE_SECTORS; j++)
+ s->s[j].nr_replicas = 0;
+
+ unlock_page(page);
+ }
+ pagevec_release(&pvec);
+ } while (index <= end_index);
+}
+
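+/*
+ * remap_file_range implementation: lock both inodes, flush and invalidate the
+ * destination's pagecache over the target range, then reflink the extents with
+ * bch2_remap_range(). REMAP_FILE_DEDUP isn't supported yet.
+ */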
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+ struct file *file_dst, loff_t pos_dst,
+ loff_t len, unsigned remap_flags)
+{
+ struct bch_inode_info *src = file_bch_inode(file_src);
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
+ loff_t ret = 0;
+ loff_t aligned_len;
+
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+ return -EINVAL;
+
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+
+ if ((pos_src & (block_bytes(c) - 1)) ||
+ (pos_dst & (block_bytes(c) - 1)))
+ return -EINVAL;
+
+ if (src == dst &&
+ abs(pos_src - pos_dst) < len)
+ return -EINVAL;
+
+ bch2_lock_inodes(INODE_LOCK, src, dst);
+
+ inode_dio_wait(&src->v);
+ inode_dio_wait(&dst->v);
+
+ __pagecache_block_get(&src->v.i_mapping->add_lock);
+ __pagecache_block_get(&dst->v.i_mapping->add_lock);
+
+ ret = generic_remap_file_range_prep(file_src, pos_src,
+ file_dst, pos_dst,
+ &len, remap_flags);
+ if (ret < 0 || len == 0)
+ goto out_unlock;
+
+ aligned_len = round_up(len, block_bytes(c));
+
+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+ pos_dst, pos_dst + aligned_len);
+ if (ret)
+ goto out_unlock;
+
+ mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+
+ ret = bch2_remap_range(c, dst,
+ POS(dst->v.i_ino, pos_dst >> 9),
+ POS(src->v.i_ino, pos_src >> 9),
+ aligned_len >> 9,
+ pos_dst + len);
+ if (ret > 0)
+ ret = min(ret << 9, len);
+
+out_unlock:
+ __pagecache_block_put(&dst->v.i_mapping->add_lock);
+ __pagecache_block_put(&src->v.i_mapping->add_lock);
+
+ bch2_unlock_inodes(INODE_LOCK, src, dst);
+
+ return ret;
+}
+
/* fseek: */
-static bool page_is_data(struct page *page)
+static int page_data_offset(struct page *page, unsigned offset)
{
struct bch_page_state *s = bch2_page_state(page);
unsigned i;
- if (!s)
- return false;
-
- for (i = 0; i < PAGE_SECTORS; i++)
- if (s->s[i].state >= SECTOR_DIRTY)
- return true;
+ if (s)
+ for (i = offset >> 9; i < PAGE_SECTORS; i++)
+ if (s->s[i].state >= SECTOR_DIRTY)
+ return i << 9;
- return false;
+ return -1;
}
-static loff_t bch2_next_pagecache_data(struct inode *vinode,
+static loff_t bch2_seek_pagecache_data(struct inode *vinode,
loff_t start_offset,
loff_t end_offset)
{
struct address_space *mapping = vinode->i_mapping;
struct page *page;
- pgoff_t index;
-
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++) {
- if (find_get_pages(mapping, &index, 1, &page)) {
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
+ pgoff_t index = start_index;
+ loff_t ret;
+ int offset;
+
+ while (index <= end_index) {
+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
lock_page(page);
- if (page_is_data(page))
- end_offset =
- min(end_offset,
- max(start_offset,
- ((loff_t) index) << PAGE_SHIFT));
+ offset = page_data_offset(page,
+ page->index == start_index
+ ? start_offset & (PAGE_SIZE - 1)
+ : 0);
+ if (offset >= 0) {
+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
+ offset,
+ start_offset, end_offset);
+ unlock_page(page);
+ put_page(page);
+ return ret;
+ }
+
unlock_page(page);
put_page(page);
} else {
return ret;
if (next_data > offset)
- next_data = bch2_next_pagecache_data(&inode->v,
+ next_data = bch2_seek_pagecache_data(&inode->v,
offset, next_data);
- if (next_data > isize)
+ if (next_data >= isize)
return -ENXIO;
return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
}
-static bool page_slot_is_data(struct address_space *mapping, pgoff_t index)
+static int __page_hole_offset(struct page *page, unsigned offset)
{
+ struct bch_page_state *s = bch2_page_state(page);
+ unsigned i;
+
+ if (!s)
+ return 0;
+
+ for (i = offset >> 9; i < PAGE_SECTORS; i++)
+ if (s->s[i].state < SECTOR_DIRTY)
+ return i << 9;
+
+ return -1;
+}
+
+static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
struct page *page;
- bool ret;
+ int pg_offset;
+ loff_t ret = -1;
page = find_lock_entry(mapping, index);
if (!page || xa_is_value(page))
- return false;
+ return offset;
+
+ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
+ if (pg_offset >= 0)
+ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
- ret = page_is_data(page);
unlock_page(page);
return ret;
}
-static loff_t bch2_next_pagecache_hole(struct inode *vinode,
+static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
loff_t start_offset,
loff_t end_offset)
{
struct address_space *mapping = vinode->i_mapping;
- pgoff_t index;
+ loff_t offset = start_offset, hole;
- for (index = start_offset >> PAGE_SHIFT;
- index < end_offset >> PAGE_SHIFT;
- index++)
- if (!page_slot_is_data(mapping, index))
- end_offset = max(start_offset,
- ((loff_t) index) << PAGE_SHIFT);
+ while (offset < end_offset) {
+ hole = page_hole_offset(mapping, offset);
+ if (hole >= 0 && hole <= end_offset)
+ return max(start_offset, hole);
+
+ offset += PAGE_SIZE;
+ offset &= PAGE_MASK;
+ }
return end_offset;
}
POS(inode->v.i_ino, offset >> 9),
BTREE_ITER_SLOTS, k, ret) {
if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
offset, MAX_LFS_FILESIZE);
break;
} else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_next_pagecache_hole(&inode->v,
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
max(offset, bkey_start_offset(k.k) << 9),
k.k->p.offset << 9);
#include <linux/uio.h>
+struct quota_res;
+
+int bch2_extent_update(struct btree_trans *,
+ struct bch_inode_info *,
+ struct disk_reservation *,
+ struct quota_res *,
+ struct btree_iter *,
+ struct bkey_i *,
+ u64, bool, bool, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+ struct bpos, struct bch_inode_info *, u64);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+ struct bch_inode_info *,
+ loff_t, unsigned);
+
int bch2_writepage(struct page *, struct writeback_control *);
int bch2_readpage(struct file *, struct page *);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+ loff_t, loff_t, unsigned);
+
loff_t bch2_llseek(struct file *, loff_t, int);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
return 0;
}
-static int bch2_fill_extent(struct fiemap_extent_info *info,
- const struct bkey_i *k, unsigned flags)
+static int bch2_fill_extent(struct bch_fs *c,
+ struct fiemap_extent_info *info,
+ struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(&k->k)) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
+ if (bkey_extent_is_data(k.k)) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
int ret;
- extent_for_each_ptr_decode(e, p, entry) {
+ if (k.k->type == KEY_TYPE_reflink_v)
+ flags |= FIEMAP_EXTENT_SHARED;
+
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
int flags2 = 0;
u64 offset = p.ptr.offset;
else
offset += p.crc.offset;
- if ((offset & (PAGE_SECTORS - 1)) ||
- (e.k->size & (PAGE_SECTORS - 1)))
+ if ((offset & (c->opts.block_size - 1)) ||
+ (k.k->size & (c->opts.block_size - 1)))
flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
ret = fiemap_fill_next_extent(info,
- bkey_start_offset(e.k) << 9,
+ bkey_start_offset(k.k) << 9,
offset << 9,
- e.k->size << 9, flags|flags2);
+ k.k->size << 9, flags|flags2);
if (ret)
return ret;
}
return 0;
- } else if (k->k.type == KEY_TYPE_reservation) {
+ } else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
- bkey_start_offset(&k->k) << 9,
- 0, k->k.size << 9,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
flags|
FIEMAP_EXTENT_DELALLOC|
FIEMAP_EXTENT_UNWRITTEN);
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- BKEY_PADDED(k) tmp;
+ BKEY_PADDED(k) cur, prev;
+ unsigned offset_into_extent, sectors;
bool have_extent = false;
int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(ei->v.i_ino, start >> 9), 0, k, ret)
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(ei->v.i_ino, start >> 9),
+ BTREE_ITER_SLOTS);
+
+ while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) {
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&cur.k, k);
+ k = bkey_i_to_s_c(&cur.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(&trans, iter,
+ &offset_into_extent, &cur.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) + offset_into_extent),
+ &cur.k);
+ bch2_key_resize(&cur.k.k, sectors);
+ cur.k.k.p.offset = iter->pos.offset + cur.k.k.size;
+
if (bkey_extent_is_data(k.k) ||
k.k->type == KEY_TYPE_reservation) {
- if (bkey_cmp(bkey_start_pos(k.k),
- POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
- break;
-
if (have_extent) {
- ret = bch2_fill_extent(info, &tmp.k, 0);
+ ret = bch2_fill_extent(c, info,
+ bkey_i_to_s_c(&prev.k), 0);
if (ret)
break;
}
- bkey_reassemble(&tmp.k, k);
+ bkey_copy(&prev.k, &cur.k);
have_extent = true;
}
- if (!ret && have_extent)
- ret = bch2_fill_extent(info, &tmp.k, FIEMAP_EXTENT_LAST);
+ bch2_btree_iter_set_pos(iter,
+ POS(iter->pos.inode,
+ iter->pos.offset + sectors));
+ }
+ if (!ret && have_extent)
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+ FIEMAP_EXTENT_LAST);
+err:
ret = bch2_trans_exit(&trans) ?: ret;
return ret < 0 ? ret : 0;
}
#ifdef CONFIG_COMPAT
.compat_ioctl = bch2_compat_fs_ioctl,
#endif
+ .remap_file_range = bch2_remap_file_range,
};
static const struct inode_operations bch_file_inode_operations = {
goto out;
}
- /* XXX: blocksize */
- sb->s_blocksize = PAGE_SIZE;
- sb->s_blocksize_bits = PAGE_SHIFT;
+ sb->s_blocksize = block_bytes(c);
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &bch_super_operations;
sb->s_export_op = &bch_export_ops;
bch2_verify_keylist_sorted(keys);
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
+retry:
+ bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
bkey_start_pos(&bch2_keylist_front(keys)->k),
bkey_copy(&split.k, bch2_keylist_front(keys));
- bch2_extent_trim_atomic(&split.k, iter);
+ ret = bch2_extent_trim_atomic(&split.k, iter);
+ if (ret)
+ break;
bch2_trans_update(&trans,
BTREE_INSERT_ENTRY(iter, &split.k));
bch2_keylist_pop_front(keys);
} while (!bch2_keylist_empty(keys));
+ if (ret == -EINTR) {
+ ret = 0;
+ goto retry;
+ }
+
bch2_trans_exit(&trans);
return ret;
p.ptr.cached = !ca->mi.durability ||
(op->flags & BCH_WRITE_CACHED) != 0;
p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_extent_ptr_decoded_append(e, &p);
+ bch2_extent_ptr_decoded_append(&e->k_i, &p);
BUG_ON(crc.compressed_size > ob->sectors_free);
ob->sectors_free -= crc.compressed_size;
struct bch_io_opts opts,
unsigned flags)
{
- if (!bkey_extent_is_data(k.k))
- return false;
-
if (!(flags & BCH_READ_MAY_PROMOTE))
return false;
if (!opts.promote_target)
return false;
- if (bch2_extent_has_target(c, bkey_s_c_to_extent(k),
- opts.promote_target))
+ if (bch2_bkey_has_target(c, k, opts.promote_target))
return false;
if (bch2_target_congested(c, opts.promote_target)) {
noinline
static struct promote_op *__promote_alloc(struct bch_fs *c,
+ enum btree_id btree_id,
struct bpos pos,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
(struct data_opts) {
.target = opts.promote_target
},
+ btree_id,
bkey_s_c_null);
BUG_ON(ret);
if (!should_promote(c, k, pos, opts, flags))
return NULL;
- promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+ promote = __promote_alloc(c,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_REFLINK
+ : BTREE_ID_EXTENTS,
+ pos, pick, opts, sectors, rbio);
if (!promote)
return NULL;
k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k),
- rbio->pick.ptr,
- rbio->pos.offset -
- rbio->pick.crc.offset)) {
+ if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+ rbio->pick.ptr,
+ rbio->pos.offset -
+ rbio->pick.crc.offset)) {
/* extent we wanted to read no longer exists: */
rbio->hole = true;
goto out;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
if (ret == READ_RETRY)
goto retry;
if (ret)
struct bkey_s_c k;
int ret;
- bch2_trans_init(&trans, c, 0, 0);
-
flags &= ~BCH_READ_LAST_FRAGMENT;
flags |= BCH_READ_MUST_CLONE;
+
+ bch2_trans_init(&trans, c, 0, 0);
retry:
+ bch2_trans_begin(&trans);
+
for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
POS(inode, bvec_iter.bi_sector),
BTREE_ITER_SLOTS, k, ret) {
BKEY_PADDED(k) tmp;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(&trans, iter,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ break;
+
+ sectors = min(sectors, k.k->size - offset_into_extent);
+
bch2_trans_unlock(&trans);
- bytes = min_t(unsigned, bvec_iter.bi_size,
- (k.k->p.offset - bvec_iter.bi_sector) << 9);
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, failed, flags);
+ ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+ offset_into_extent, failed, flags);
switch (ret) {
case READ_RETRY:
goto retry;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_i_extent *e;
BKEY_PADDED(k) new;
struct bch_extent_crc_unpacked new_crc;
u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
if (IS_ERR_OR_NULL(k.k))
goto out;
- if (!bkey_extent_is_data(k.k))
- goto out;
-
bkey_reassemble(&new.k, k);
- e = bkey_i_to_extent(&new.k);
+ k = bkey_i_to_s_c(&new.k);
- if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e),
- rbio->pick.ptr, data_offset) ||
- bversion_cmp(e->k.version, rbio->version))
+ if (bversion_cmp(k.k->version, rbio->version) ||
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
/* Extent was merged? */
- if (bkey_start_offset(&e->k) < data_offset ||
- e->k.p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+ if (bkey_start_offset(k.k) < data_offset ||
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
goto out;
if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
rbio->pick.crc, NULL, &new_crc,
- bkey_start_offset(&e->k) - data_offset, e->k.size,
+ bkey_start_offset(k.k) - data_offset, k.k->size,
rbio->pick.crc.csum_type)) {
bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
goto out;
}
- if (!bch2_extent_narrow_crcs(e, new_crc))
+ if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
goto out;
- bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &e->k_i));
+ bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k));
ret = bch2_trans_commit(&trans, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
bch2_trans_exit(&trans);
}
-static bool should_narrow_crcs(struct bkey_s_c k,
- struct extent_ptr_decoded *pick,
- unsigned flags)
-{
- return !(flags & BCH_READ_IN_RETRY) &&
- bkey_extent_is_data(k.k) &&
- bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc);
-}
-
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
goto nodecode;
/* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset;
+ crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
if (crc.compression_type != BCH_COMPRESSION_NONE) {
bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
}
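+/*
+ * If @orig_k is a reflink pointer, look up the indirect extent it points to in
+ * the reflink btree and substitute it into @orig_k, adjusting
+ * *offset_into_extent to be relative to the indirect extent; a reflink pointer
+ * to a missing reflink_v is an I/O error.
+ */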
+int bch2_read_indirect_extent(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ unsigned *offset_into_extent,
+ struct bkey_i *orig_k)
+{
+ struct btree_iter *iter;
+ struct bkey_s_c k;
+ u64 reflink_offset;
+ int ret;
+
+ if (orig_k->k.type != KEY_TYPE_reflink_p)
+ return 0;
+
+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+ *offset_into_extent;
+
+ iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+ POS(0, reflink_offset),
+ BTREE_ITER_SLOTS, 1);
+ ret = PTR_ERR_OR_ZERO(iter);
+ if (ret)
+ return ret;
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ if (k.k->type != KEY_TYPE_reflink_v) {
+ __bcache_io_error(trans->c,
+ "pointer to nonexistent indirect extent");
+ ret = -EIO;
+ goto err;
+ }
+
+ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+ bkey_reassemble(orig_k, k);
+err:
+ bch2_trans_iter_put(trans, iter);
+ return ret;
+}
+
int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
+ unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
{
struct extent_ptr_decoded pick;
if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
goto hole;
- iter.bi_sector = pos.offset;
iter.bi_size = pick.crc.compressed_size << 9;
goto noclone;
}
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_MUST_CLONE;
- narrow_crcs = should_narrow_crcs(k, &pick, flags);
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
flags |= BCH_READ_MUST_BOUNCE;
- EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
- k.k->p.offset < bvec_iter_end_sector(iter));
+ BUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
if (pick.crc.compression_type != BCH_COMPRESSION_NONE ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
bvec_iter_sectors(iter) != pick.crc.live_size ||
pick.crc.offset ||
- iter.bi_sector != pos.offset));
+ offset_into_extent));
+ pos.offset += offset_into_extent;
pick.ptr.offset += pick.crc.offset +
- (iter.bi_sector - pos.offset);
+ offset_into_extent;
+ offset_into_extent = 0;
pick.crc.compressed_size = bvec_iter_sectors(iter);
pick.crc.uncompressed_size = bvec_iter_sectors(iter);
pick.crc.offset = 0;
pick.crc.live_size = bvec_iter_sectors(iter);
- pos.offset = iter.bi_sector;
+ offset_into_extent = 0;
}
if (rbio) {
else
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
+ rbio->offset_into_extent = offset_into_extent;
rbio->flags = flags;
rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
rbio->narrow_crcs = narrow_crcs;
rbio->c = c;
rbio->start_time = local_clock();
- for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
- POS(inode, rbio->bio.bi_iter.bi_sector),
- BTREE_ITER_SLOTS, k, ret) {
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ POS(inode, rbio->bio.bi_iter.bi_sector),
+ BTREE_ITER_SLOTS);
+
+ while (1) {
BKEY_PADDED(k) tmp;
- unsigned bytes;
+ unsigned bytes, sectors, offset_into_extent;
+
+ bch2_btree_iter_set_pos(iter,
+ POS(inode, rbio->bio.bi_iter.bi_sector));
+
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&tmp.k, k);
+ k = bkey_i_to_s_c(&tmp.k);
+
+ offset_into_extent = iter->pos.offset -
+ bkey_start_offset(k.k);
+ sectors = k.k->size - offset_into_extent;
+
+ ret = bch2_read_indirect_extent(&trans, iter,
+ &offset_into_extent, &tmp.k);
+ if (ret)
+ goto err;
+
+ /*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+ sectors = min(sectors, k.k->size - offset_into_extent);
/*
* Unlock the iterator while the btree node's lock is still in
* cache, before doing the IO:
*/
- bkey_reassemble(&tmp.k, k);
- k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size,
- (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9);
+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
swap(rbio->bio.bi_iter.bi_size, bytes);
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- bch2_read_extent(c, rbio, k, flags);
+ bch2_read_extent(c, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
- return;
+ break;
swap(rbio->bio.bi_iter.bi_size, bytes);
bio_advance(&rbio->bio, bytes);
}
-
- /*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
- BUG_ON(!ret);
- bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
-
+out:
bch2_trans_exit(&trans);
+ return;
+err:
+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
bch2_rbio_done(rbio);
+ goto out;
}
void bch2_fs_io_exit(struct bch_fs *c)
struct cache_promote_op;
struct extent_ptr_decoded;
-int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- struct bkey_s_c, struct bch_io_failures *, unsigned);
-void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *,
+ unsigned *, struct bkey_i *);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_IN_RETRY = 1 << 7,
};
+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *,
+ struct bvec_iter, struct bkey_s_c, unsigned,
+ struct bch_io_failures *, unsigned);
+
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c k,
+ unsigned offset_into_extent,
unsigned flags)
{
- __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k,
+ offset_into_extent, NULL, flags);
}
+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
+
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{
*/
struct bvec_iter bvec_iter;
+ unsigned offset_into_extent;
+
u16 flags;
union {
struct {
return 0;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
+ enum btree_id btree_id)
{
struct btree_trans trans;
struct btree_iter *iter;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS_MIN, BTREE_ITER_PREFETCH);
+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+ BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k))) {
- if (!bkey_extent_is_data(k.k) ||
- !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
+ if (!bch2_bkey_has_device(k, dev_idx)) {
ret = bch2_mark_bkey_replicas(c, k);
if (ret)
break;
return ret;
}
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+}
+
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans trans;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, m->btree_id,
bkey_start_pos(&bch2_keylist_front(keys)->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- struct bkey_i_extent *insert, *new =
+ struct bkey_i *insert;
+ struct bkey_i_extent *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
break;
if (bversion_cmp(k.k->version, new->k.version) ||
- !bkey_extent_is_data(k.k) ||
- !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
- m->ptr, m->offset))
+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
goto nomatch;
if (m->data_cmd == DATA_REWRITE &&
- !bch2_extent_has_device(bkey_s_c_to_extent(k),
- m->data_opts.rewrite_dev))
+ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
goto nomatch;
bkey_reassemble(&_insert.k, k);
- insert = bkey_i_to_extent(&_insert.k);
+ insert = &_insert.k;
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
- bch2_cut_front(iter->pos, &insert->k_i);
+ bch2_cut_front(iter->pos, insert);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
if (m->data_cmd == DATA_REWRITE)
- bch2_bkey_drop_device(extent_i_to_s(insert).s,
+ bch2_bkey_drop_device(bkey_i_to_s(insert),
m->data_opts.rewrite_dev);
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
- if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
/*
* raced with another move op? extent already
* has a pointer to the device we just wrote
if (!did_work)
goto nomatch;
- bch2_extent_narrow_crcs(insert,
+ bch2_bkey_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, extent_i_to_s(insert).s);
- bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
- op->opts.background_target,
- op->opts.data_replicas);
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
+ op->opts.background_target,
+ op->opts.data_replicas);
/*
* If we're not fully overwriting @k, and it's compressed, we
* need a reservation for all the pointers in @insert
*/
- nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+ nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
m->nr_ptrs_reserved;
if (insert->k.size < k.k->size &&
}
bch2_trans_update(&trans,
- BTREE_INSERT_ENTRY(iter, &insert->k_i));
+ BTREE_INSERT_ENTRY(iter, insert));
ret = bch2_trans_commit(&trans, &op->res,
op_journal_seq(op),
struct bch_io_opts io_opts,
enum data_cmd data_cmd,
struct data_opts data_opts,
+ enum btree_id btree_id,
struct bkey_s_c k)
{
int ret;
+ m->btree_id = btree_id;
m->data_cmd = data_cmd;
m->data_opts = data_opts;
m->nr_ptrs_reserved = 0;
break;
}
case DATA_REWRITE: {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned compressed_sectors = 0;
- extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
p.crc.compression_type != BCH_COMPRESSION_NONE &&
bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
struct moving_context *ctxt,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
- struct bkey_s_c_extent e,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- unsigned sectors = e.k->size, pages;
+ unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
move_ctxt_wait_event(ctxt,
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
goto err;
io->write.ctxt = ctxt;
- io->read_sectors = e.k->size;
- io->write_sectors = e.k->size;
+ io->read_sectors = k.k->size;
+ io->write_sectors = k.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->write.op.wbio.bio,
io->rbio.bio.bi_iter.bi_size = sectors << 9;
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
- data_cmd, data_opts, e.s_c);
+ data_cmd, data_opts, btree_id, k);
if (ret)
goto err_free_pages;
atomic64_inc(&ctxt->stats->keys_moved);
- atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
- trace_move_extent(e.k);
+ trace_move_extent(k.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(c, &io->rbio, e.s_c,
+ bch2_read_extent(c, &io->rbio, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
err_free:
kfree(io);
err:
- trace_move_alloc_fail(e.k);
+ trace_move_alloc_fail(k.k);
return ret;
}
-int bch2_move_data(struct bch_fs *c,
- struct bch_ratelimit *rate,
- struct write_point_specifier wp,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- struct bch_move_stats *stats)
+static int __bch2_move_data(struct bch_fs *c,
+ struct moving_context *ctxt,
+ struct bch_ratelimit *rate,
+ struct write_point_specifier wp,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ struct bch_move_stats *stats,
+ enum btree_id btree_id)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
- struct moving_context ctxt = { .stats = stats };
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
BKEY_PADDED(k) tmp;
struct btree_trans trans;
u64 delay, cur_inum = U64_MAX;
int ret = 0, ret2;
- closure_init_stack(&ctxt.cl);
- INIT_LIST_HEAD(&ctxt.reads);
- init_waitqueue_head(&ctxt.wait);
-
bch2_trans_init(&trans, c, 0, 0);
stats->data_type = BCH_DATA_USER;
- stats->btree_id = BTREE_ID_EXTENTS;
+ stats->btree_id = btree_id;
stats->pos = POS_MIN;
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+ iter = bch2_trans_get_iter(&trans, btree_id, start,
BTREE_ITER_PREFETCH);
if (rate)
if (unlikely(freezing(current))) {
bch2_trans_unlock(&trans);
- move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
k = bkey_i_to_s_c(&tmp.k);
bch2_trans_unlock(&trans);
- ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
- bkey_s_c_to_extent(k),
+ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
data_cmd, data_opts);
if (ret2) {
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
+ bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
bch2_trans_cond_resched(&trans);
}
out:
- bch2_trans_exit(&trans);
+ ret = bch2_trans_exit(&trans) ?: ret;
+
+ return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+ struct bch_ratelimit *rate,
+ struct write_point_specifier wp,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ struct bch_move_stats *stats)
+{
+ struct moving_context ctxt = { .stats = stats };
+ int ret;
+
+ closure_init_stack(&ctxt.cl);
+ INIT_LIST_HEAD(&ctxt.reads);
+ init_waitqueue_head(&ctxt.wait);
+
+ stats->data_type = BCH_DATA_USER;
+
+ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
+ pred, arg, stats, BTREE_ID_EXTENTS) ?:
+ __bch2_move_data(c, &ctxt, rate, wp, start, end,
+ pred, arg, stats, BTREE_ID_REFLINK);
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
};
struct migrate_write {
+ enum btree_id btree_id;
enum data_cmd data_cmd;
struct data_opts data_opts;
struct write_point_specifier,
struct bch_io_opts,
enum data_cmd, struct data_opts,
- struct bkey_s_c);
+ enum btree_id, struct bkey_s_c);
typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
struct bkey_s_c,
struct bkey_s_c k)
{
copygc_heap *h = &ca->copygc_heap;
+ const struct bch_extent_ptr *ptr =
+ bch2_bkey_has_device(k, ca->dev_idx);
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const struct bch_extent_ptr *ptr =
- bch2_extent_has_device(e, ca->dev_idx);
+ if (ptr) {
+ struct copygc_heap_entry search = { .offset = ptr->offset };
- if (ptr) {
- struct copygc_heap_entry search = { .offset = ptr->offset };
+ ssize_t i = eytzinger0_find_le(h->data, h->used,
+ sizeof(h->data[0]),
+ bucket_offset_cmp, &search);
- ssize_t i = eytzinger0_find_le(h->data, h->used,
- sizeof(h->data[0]),
- bucket_offset_cmp, &search);
-
- return (i >= 0 &&
- ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
- ptr->gen == h->data[i].gen);
- }
- break;
- }
+ return (i >= 0 &&
+ ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+ ptr->gen == h->data[i].gen);
}
return false;
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
!io_opts->background_compression)
return;
- e = bkey_s_c_to_extent(k);
-
- extent_for_each_ptr_decode(e, p, entry)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (rebalance_ptr_pred(c, p, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
-static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
+ struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter, *split_iter;
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i *split;
+ struct bpos atomic_end;
bool split_compressed = false;
int ret;
retry:
bch2_trans_begin(&trans);
- iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+ iter = bch2_trans_get_iter(&trans, btree_id,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
if (ret)
goto err;
+ ret = bch2_extent_atomic_end(&trans, split_iter,
+ k, &atomic_end);
+ if (ret)
+ goto err;
+
if (!split_compressed &&
bch2_extent_is_compressed(bkey_i_to_s_c(k)) &&
- !bch2_extent_is_atomic(k, split_iter)) {
+ bkey_cmp(atomic_end, k->k.p) < 0) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
bkey_copy(split, k);
bch2_cut_front(split_iter->pos, split);
- bch2_extent_trim_atomic(split, split_iter);
+ bch2_cut_back(atomic_end, &split->k);
bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split));
bch2_btree_iter_set_pos(iter, split->k.p);
if (split_compressed) {
ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k),
- -((s64) k->k.size),
+ 0, -((s64) k->k.size),
BCH_BUCKET_MARK_OVERWRITE) ?:
bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_ATOMIC|
for_each_journal_key(keys, i) {
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
- switch (i->btree_id) {
- case BTREE_ID_ALLOC:
+ if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
- break;
- case BTREE_ID_EXTENTS:
- ret = bch2_extent_replay_key(c, i->k);
- break;
- default:
+ else if (btree_node_type_is_extents(i->btree_id))
+ ret = bch2_extent_replay_key(c, i->btree_id, i->k);
+ else
ret = bch2_btree_insert(c, i->btree_id, i->k,
NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK);
- break;
- }
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
--- /dev/null
+++ b/fs/bcachefs/reflink.c
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "extents.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "reflink.h"
+
+#include <linux/sched/signal.h>
+
+/* reflink pointers */
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ if (bkey_val_bytes(p.k) != sizeof(*p.v))
+ return "incorrect value size";
+
+ return NULL;
+}
+
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+
+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+}
+
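+/*
+ * Adjacent reflink pointers merge only if they reference adjacent ranges in
+ * the reflink btree (l.idx + l.size == r.idx); if the combined size would
+ * exceed KEY_SIZE_MAX, merge as much as fits and return BCH_MERGE_PARTIAL:
+ */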
+enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
+ struct bkey_s _l, struct bkey_s _r)
+{
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
+
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
+ return BCH_MERGE_NOMERGE;
+
+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
+ bch2_key_resize(l.k, KEY_SIZE_MAX);
+ __bch2_cut_front(l.k->p, _r);
+ return BCH_MERGE_PARTIAL;
+ }
+
+ bch2_key_resize(l.k, l.k->size + r.k->size);
+
+ return BCH_MERGE_MERGE;
+}
+
+/* indirect extents */
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ if (bkey_val_bytes(r.k) < sizeof(*r.v))
+ return "incorrect value size";
+
+ return bch2_bkey_ptrs_invalid(c, k);
+}
+
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+
+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
+
+ bch2_bkey_ptrs_to_text(out, c, k);
+}
+
+/*
+ * bch2_remap_range() depends on bch2_extent_update(), which depends on various
+ * things tied to the linux vfs for inode updates, for now:
+ */
+#ifndef NO_BCACHEFS_FS
+
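+/*
+ * Copy @e's value into the reflink btree as a new indirect extent (reflink_v),
+ * then rewrite @e in place as a reflink_p pointing at it; both updates are
+ * queued on @trans for the caller to commit:
+ */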
+static int bch2_make_extent_indirect(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct bkey_i_extent *e)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *reflink_iter;
+ struct bkey_s_c k;
+ struct bkey_i_reflink_v *r_v;
+ struct bkey_i_reflink_p *r_p;
+ int ret;
+
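+ /*
+ * Find a hole in the reflink btree big enough for the new indirect
+ * extent, starting the search from the cached allocation hint:
+ */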
+ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
+ POS(0, c->reflink_hint),
+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
+ if (reflink_iter->pos.inode) {
+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+ continue;
+ }
+
+ if (bkey_deleted(k.k) && e->k.size <= k.k->size)
+ break;
+ }
+
+ if (ret)
+ goto err;
+
+ /* rewind iter to start of hole, if necessary: */
+ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
+
+ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
+ ret = PTR_ERR_OR_ZERO(r_v);
+ if (ret)
+ goto err;
+
+ bkey_reflink_v_init(&r_v->k_i);
+ r_v->k.p = reflink_iter->pos;
+ bch2_key_resize(&r_v->k, e->k.size);
+ r_v->k.version = e->k.version;
+
+ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
+ bkey_val_u64s(&e->k));
+ r_v->v.refcount = 0;
+ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
+
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i));
+
+ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
+ if (IS_ERR(r_p))
+ return PTR_ERR(r_p);
+
+ e->k.type = KEY_TYPE_reflink_p;
+ r_p = bkey_i_to_reflink_p(&e->k_i);
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
+
+ bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i));
+err:
+ if (!IS_ERR(reflink_iter)) {
+ c->reflink_hint = reflink_iter->pos.offset;
+ bch2_trans_iter_put(trans, reflink_iter);
+ }
+
+ return ret;
+}
+
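+/*
+ * Advance @iter to the next key in the source range that can be reflinked (an
+ * ordinary extent or an existing reflink pointer), returning bkey_s_c_null
+ * once @end is reached:
+ */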
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+{
+ struct bkey_s_c k = bch2_btree_iter_peek(iter);
+
+ while (1) {
+ if (bkey_err(k))
+ return k;
+
+ if (bkey_cmp(iter->pos, end) >= 0)
+ return bkey_s_c_null;
+
+ if (k.k->type == KEY_TYPE_extent ||
+ k.k->type == KEY_TYPE_reflink_p)
+ return k;
+
+ k = bch2_btree_iter_next(iter);
+ }
+}
+
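+/*
+ * Remap @remap_sectors of extents from @src_start to @dst_start: ordinary
+ * extents in the source are first made indirect, then reflink pointers to
+ * them are inserted at the destination, with holes in the source becoming
+ * holes in the destination. Returns the number of sectors that were remapped,
+ * or an error code if nothing was remapped:
+ */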
+s64 bch2_remap_range(struct bch_fs *c,
+ struct bch_inode_info *dst_inode,
+ struct bpos dst_start, struct bpos src_start,
+ u64 remap_sectors, u64 new_i_size)
+{
+ struct btree_trans trans;
+ struct btree_iter *dst_iter, *src_iter;
+ struct bkey_s_c src_k;
+ BKEY_PADDED(k) new_dst, new_src;
+ struct bpos dst_end = dst_start, src_end = src_start;
+ struct bpos dst_want, src_want;
+ u64 src_done, dst_done;
+ int ret = 0;
+
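+ /* Flag the filesystem as using reflink in the superblock, if it isn't already: */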
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+ mutex_lock(&c->sb_lock);
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
+ c->disk_sb.sb->features[0] |=
+ cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
+
+ bch2_write_super(c);
+ }
+ mutex_unlock(&c->sb_lock);
+ }
+
+ dst_end.offset += remap_sectors;
+ src_end.offset += remap_sectors;
+
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+
+ src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
+ BTREE_ITER_INTENT, 1);
+ dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
+ BTREE_ITER_INTENT, 2);
+
+ while (1) {
+ bch2_trans_begin_updates(&trans);
+ trans.mem_top = 0;
+
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ goto err;
+ }
+
+ src_k = get_next_src(src_iter, src_end);
+ ret = bkey_err(src_k);
+ if (ret)
+ goto btree_err;
+
+ src_done = bpos_min(src_iter->pos, src_end).offset -
+ src_start.offset;
+ dst_want = POS(dst_start.inode, dst_start.offset + src_done);
+
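+ /*
+ * If the source has a hole before the next reflinkable key, punch a
+ * matching hole in the destination:
+ */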
+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
+ dst_inode, new_i_size);
+ if (ret)
+ goto btree_err;
+ continue;
+ }
+
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
+
+ if (!bkey_cmp(dst_iter->pos, dst_end))
+ break;
+
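+ /*
+ * An ordinary extent in the source must be converted to an indirect
+ * extent before it can be shared:
+ */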
+ if (src_k.k->type == KEY_TYPE_extent) {
+ bkey_reassemble(&new_src.k, src_k);
+ src_k = bkey_i_to_s_c(&new_src.k);
+
+ bch2_cut_front(src_iter->pos, &new_src.k);
+ bch2_cut_back(src_end, &new_src.k.k);
+
+ ret = bch2_make_extent_indirect(&trans, src_iter,
+ bkey_i_to_extent(&new_src.k));
+ if (ret)
+ goto btree_err;
+
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
+ }
+
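+ /*
+ * Construct the reflink pointer for the destination, offsetting idx by
+ * how far the source iterator has advanced into the source key:
+ */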
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
+ struct bkey_s_c_reflink_p src_p =
+ bkey_s_c_to_reflink_p(src_k);
+ struct bkey_i_reflink_p *dst_p =
+ bkey_reflink_p_init(&new_dst.k);
+
+ u64 offset = le64_to_cpu(src_p.v->idx) +
+ (src_iter->pos.offset -
+ bkey_start_offset(src_k.k));
+
+ dst_p->v.idx = cpu_to_le64(offset);
+ } else {
+ BUG();
+ }
+
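+ /*
+ * Size the destination key to the smaller of what remains of the
+ * source key and what remains of the remap range:
+ */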
+ new_dst.k.k.p = dst_iter->pos;
+ bch2_key_resize(&new_dst.k.k,
+ min(src_k.k->p.offset - src_iter->pos.offset,
+ dst_end.offset - dst_iter->pos.offset));
+
+ ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
+ dst_iter, &new_dst.k,
+ new_i_size, false, true, NULL);
+ if (ret)
+ goto btree_err;
+
+ dst_done = dst_iter->pos.offset - dst_start.offset;
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
+ bch2_btree_iter_set_pos(src_iter, src_want);
+btree_err:
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ goto err;
+ }
+
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
+err:
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+
+ dst_done = dst_iter->pos.offset - dst_start.offset;
+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+
+ ret = bch2_trans_exit(&trans) ?: ret;
+
+ mutex_lock(&dst_inode->ei_update_lock);
+ if (dst_inode->v.i_size < new_i_size) {
+ i_size_write(&dst_inode->v, new_i_size);
+ ret = bch2_write_inode_size(c, dst_inode, new_i_size,
+ ATTR_MTIME|ATTR_CTIME);
+ }
+ mutex_unlock(&dst_inode->ei_update_lock);
+
+ return dst_done ?: ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
--- /dev/null
+++ b/fs/bcachefs/reflink.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+ struct bkey_s, struct bkey_s);
+
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
+ .key_invalid = bch2_reflink_p_invalid, \
+ .val_to_text = bch2_reflink_p_to_text, \
+ .key_merge = bch2_reflink_p_merge, \
+}
+
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+ struct bkey_s_c);
+
+
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
+ .key_invalid = bch2_reflink_v_invalid, \
+ .val_to_text = bch2_reflink_v_to_text, \
+}
+
+#ifndef NO_BCACHEFS_FS
+s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
+ struct bpos, struct bpos, u64, u64);
+#endif /* NO_BCACHEFS_FS */
+
+#endif /* _BCACHEFS_REFLINK_H */
extent_to_replicas(k, e);
break;
case KEY_TYPE_extent:
+ case KEY_TYPE_reflink_v:
e->data_type = BCH_DATA_USER;
extent_to_replicas(k, e);
break;