-fc4f1d59cf9330bbb27cd12c459706aa5e7fe33c
+9e76e8d98c52c128641b0f916a1990a37d60d22e
x(btree_node_sort) \
x(btree_node_read) \
x(btree_gc) \
- x(btree_update) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
/* misc: */
BCH_FS_BDEV_MOUNTED,
BCH_FS_FIXED_GENS,
+ BCH_FS_ALLOC_WRITTEN,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
const struct bkey_packed *src)
{
- dst->k = bkey_unpack_key(b, src);
+ __bkey_unpack_key(b, &dst->k, src);
memcpy_u64s(&dst->v,
bkeyp_val(&b->format, src),
(u64 *) (_dst) < (u64 *) (_src) + \
((struct bkey *) (_src))->u64s); \
\
- __memmove_u64s_down((_dst), (_src), \
- ((struct bkey *) (_src))->u64s); \
+ memcpy_u64s_small((_dst), (_src), \
+ ((struct bkey *) (_src))->u64s); \
} while (0)
struct btree;
* So we've got to search for start_of_range, then after the lookup iterate
* past any extents that compare equal to the position we searched for.
*/
+__flatten
void bch2_btree_node_iter_init(struct btree_node_iter *iter,
struct btree *b, struct bpos *search)
{
struct bset_tree *t;
struct bkey_packed p, *packed_search = NULL;
+ struct btree_node_iter_set *pos = iter->data;
EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0);
bset_aux_tree_verify(b);
return;
}
- for_each_bset(b, t)
- __bch2_btree_node_iter_push(iter, b,
- bch2_bset_search(b, t, search,
- packed_search, &p),
- btree_bkey_last(b, t));
+ for_each_bset(b, t) {
+ struct bkey_packed *k = bch2_bset_search(b, t, search,
+ packed_search, &p);
+ struct bkey_packed *end = btree_bkey_last(b, t);
+
+ if (k != end)
+ *pos++ = (struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ };
+ }
bch2_btree_node_iter_sort(iter, b);
}
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
goto err;
- memset(&b->data->csum, 0, sizeof b->data->csum);
- b->data->flags = 0;
-
bc->used++;
list_move(&b->list, &bc->freeable);
return;
wbio->data = data;
wbio->wbio.order = order;
wbio->wbio.used_mempool = used_mempool;
- wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
+ if (b->level || !b->written)
+ wbio->wbio.bio.bi_opf |= REQ_FUA;
+
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
{
- unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
- unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
+ unsigned total_u64s = bset_u64s(t);
+ unsigned dead_u64s = total_u64s - b->nr.bset_u64s[t - b->set];
- return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
}
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
/* Btree node locking: */
-/*
- * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
- * succeed:
- */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
- struct btree_iter *linked;
-
- EBUG_ON(iter->l[b->level].b != b);
- EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
-
- trans_for_each_iter_with_node(iter->trans, b, linked)
- linked->l[b->level].lock_seq += 2;
-
- six_unlock_write(&b->lock);
+ bch2_btree_node_unlock_write_inlined(b, iter);
}
void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
__flatten
static bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
{
- return iter->uptodate >= BTREE_ITER_NEED_RELOCK
- ? btree_iter_get_locks(iter, false, trace)
- : true;
+ return btree_iter_get_locks(iter, false, trace);
}
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}
+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_node_iter *node_iter = &iter->l[0].iter;
+
+ if (where == bch2_btree_node_iter_peek_all(node_iter, b)) {
+ bkey_disassemble(b, where, &iter->k);
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+ }
+}
+
+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+ struct btree *b,
+ struct bkey_packed *where)
+{
+ struct btree_iter *linked;
+
+ trans_for_each_iter_with_node(iter->trans, b, linked) {
+ __bch2_btree_iter_fix_key_modified(linked, b, where);
+ __bch2_btree_iter_verify(linked, b);
+ }
+}
+
static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
btree_node_unlock(iter, iter->level);
}
-static inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter)
{
struct bch_fs *c = iter->trans->c;
struct btree_iter_level *l = &iter->l[iter->level];
enum six_lock_type lock_type = __btree_lock_want(iter, level);
BKEY_PADDED(k) tmp;
- BUG_ON(!btree_node_locked(iter, iter->level));
+ EBUG_ON(!btree_node_locked(iter, iter->level));
bch2_bkey_unpack(l->b, &tmp.k,
bch2_btree_node_iter_peek(&l->iter, l->b));
if (unlikely(iter->level >= BTREE_MAX_DEPTH))
return 0;
- if (bch2_btree_iter_relock(iter, false))
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
+ bch2_btree_iter_relock(iter, false);
+
+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
/*
/* Iterate over iters within a transaction: */
+#define trans_for_each_iter_all(_trans, _iter) \
+ for (_iter = (_trans)->iters; \
+ _iter < (_trans)->iters + (_trans)->nr_iters; \
+ _iter++)
+
static inline struct btree_iter *
__trans_next_iter(struct btree_trans *trans, unsigned idx)
{
static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
#endif
+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
+ struct bkey_packed *);
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_packed *,
unsigned, unsigned);
__bch2_btree_node_relock(iter, level);
}
+/*
+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
+ * succeed:
+ */
+static inline void
+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ EBUG_ON(iter->l[b->level].b != b);
+ EBUG_ON(iter->l[b->level].lock_seq + 1 != b->lock.state.seq);
+
+ trans_for_each_iter_with_node(iter->trans, b, linked)
+ linked->l[b->level].lock_seq += 2;
+
+ six_unlock_write(&b->lock);
+}
+
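The lock_seq arithmetic follows from the EBUG_ON() above: while the node is write locked, the six lock's sequence number equals the iterator's saved lock_seq + 1, and, assuming six_unlock_write() bumps it once more (which is what that +1 check implies), it becomes lock_seq + 2 after the unlock. Bumping each linked iterator's saved value by 2 therefore keeps it equal to the lock's new sequence number, which is why a later bch2_btree_node_relock() can still succeed.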
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
struct btree_trans {
struct bch_fs *c;
unsigned long ip;
- u64 commit_start;
u64 iters_linked;
u64 iters_live;
struct disk_reservation *disk_res;
unsigned flags;
unsigned journal_u64s;
+ struct replicas_delta_list *fs_usage_deltas;
struct btree_iter iters_onstack[2];
struct btree_insert_entry updates_onstack[6];
u8 updates_sorted_onstack[6];
-
- struct replicas_delta_list *fs_usage_deltas;
};
#define BTREE_FLAG(flag) \
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
+static inline unsigned bset_u64s(struct bset_tree *t)
+{
+ return t->end_offset - t->data_offset -
+ sizeof(struct bset) / sizeof(u64);
+}
+
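A note on the arithmetic, since bset_u64s() replaces several le16_to_cpu(bset(b, t)->u64s) reads elsewhere in this patch: data_offset and end_offset are both measured in u64s from the start of the node, so their difference covers the struct bset header plus its keys, and subtracting sizeof(struct bset) / sizeof(u64) leaves just the key u64s, i.e. the same value the bset's u64s field holds, but computed from the bset_tree alone (presumably so the bset header itself need not be touched).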
static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
return i - (void *) b->data;
return btree_node_type_is_extents(btree_node_type(b));
}
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_ALLOC)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK)| \
+ (1U << BKEY_TYPE_EC)| \
+ (1U << BKEY_TYPE_BTREE))
+
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
+ ((1U << BKEY_TYPE_EXTENTS)| \
+ (1U << BKEY_TYPE_INODES)| \
+ (1U << BKEY_TYPE_REFLINK))
+
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
- switch (type) {
- case BKEY_TYPE_ALLOC:
- case BKEY_TYPE_BTREE:
- case BKEY_TYPE_EXTENTS:
- case BKEY_TYPE_INODES:
- case BKEY_TYPE_EC:
- case BKEY_TYPE_REFLINK:
- return true;
- default:
- return false;
- }
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
}
struct btree_root {
__BTREE_INSERT_JOURNAL_RESERVED,
__BTREE_INSERT_NOMARK_OVERWRITES,
__BTREE_INSERT_NOMARK,
- __BTREE_INSERT_MARK_INMEM,
__BTREE_INSERT_NO_CLEAR_REPLICAS,
__BTREE_INSERT_BUCKET_INVALIDATE,
__BTREE_INSERT_NOWAIT,
/* Don't call mark new key at all: */
#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
-/* Don't mark transactionally: */
-#define BTREE_INSERT_MARK_INMEM (1 << __BTREE_INSERT_MARK_INMEM)
-
#define BTREE_INSERT_NO_CLEAR_REPLICAS (1 << __BTREE_INSERT_NO_CLEAR_REPLICAS)
#define BTREE_INSERT_BUCKET_INVALIDATE (1 << __BTREE_INSERT_BUCKET_INVALIDATE)
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_btree_ptr *);
-int bch2_trans_commit(struct btree_trans *,
- struct disk_reservation *,
- u64 *, unsigned);
+int __bch2_trans_commit(struct btree_trans *);
+
+/**
+ * bch2_trans_commit - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ * if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+static inline int bch2_trans_commit(struct btree_trans *trans,
+ struct disk_reservation *disk_res,
+ u64 *journal_seq,
+ unsigned flags)
+{
+ trans->disk_res = disk_res;
+ trans->journal_seq = journal_seq;
+ trans->flags = flags;
+
+ return __bch2_trans_commit(trans);
+}
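As the comment above notes, -EINTR is only returned to callers that passed BTREE_INSERT_ATOMIC, and those callers are expected to redo their lookups and updates before committing again. A minimal sketch of that pattern, using only the interface shown above (the bch2_trans_update() calls it would repeat are elided, and NULL is passed for the optional disk reservation and journal sequence pointer):

	int ret;

	do {
		/* redo btree iterator lookups and bch2_trans_update() calls */
		ret = bch2_trans_commit(trans, NULL, NULL,
					BTREE_INSERT_ATOMIC);
	} while (ret == -EINTR);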
static inline void bch2_trans_update(struct btree_trans *trans,
struct btree_iter *iter,
bch2_bset_init_first(b, &b->data->keys);
bch2_btree_build_aux_trees(b);
+ b->data->flags = 0;
b->data->min_key = POS_MIN;
b->data->max_key = POS_MAX;
b->data->format = bch2_btree_calc_format(b);
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
- struct bset *i = btree_bset_last(b);
+ struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
- if (unlikely(bset_written(b, i))) {
+ if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
- if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
#include "keylist.h"
#include "replicas.h"
+#include <linux/prefetch.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static inline bool same_leaf_as_prev(struct btree_trans *trans,
- unsigned sorted_idx)
+ unsigned idx)
{
- struct btree_insert_entry *i = trans->updates +
- trans->updates_sorted[sorted_idx];
- struct btree_insert_entry *prev = sorted_idx
- ? trans->updates + trans->updates_sorted[sorted_idx - 1]
- : NULL;
-
- return prev &&
- i->iter->l[0].b == prev->iter->l[0].b;
+ return idx &&
+ trans->updates[trans->updates_sorted[idx]].iter->l[0].b ==
+ trans->updates[trans->updates_sorted[idx - 1]].iter->l[0].b;
}
#define trans_for_each_update_sorted(_trans, _i, _iter) \
bch2_btree_init_next(c, b, iter);
}
-static void btree_trans_lock_write(struct btree_trans *trans, bool lock)
-{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned iter;
-
- trans_for_each_update_sorted(trans, i, iter) {
- if (same_leaf_as_prev(trans, iter))
- continue;
-
- if (lock)
- bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
- else
- bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
- }
-}
-
static inline void btree_trans_sort_updates(struct btree_trans *trans)
{
struct btree_insert_entry *l, *r;
trans->updates_sorted[pos] = l - trans->updates;
nr++;
}
-
- BUG_ON(nr != trans->nr_updates);
}
/* Inserting into a given leaf node (last stage of insert): */
EBUG_ON(insert->k->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
- if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
- insert->k))
+ if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter,
+ insert->k)))
bch2_btree_journal_key(trans, iter, insert->k);
}
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
- int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
+ struct bset_tree *t = bset_tree_last(b);
+ int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
bch2_insert_fixup_extent(trans, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
- u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
+ u64s_added = (int) bset_u64s(t) - old_u64s;
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id));
}
-static int bch2_trans_journal_preres_get(struct btree_trans *trans)
+static noinline int
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned u64s = 0;
int ret;
- trans_for_each_update(trans, i)
- if (0)
- u64s += jset_u64s(i->k->k.u64s);
-
- if (!u64s)
- return 0;
-
- ret = bch2_journal_preres_get(&c->journal,
- &trans->journal_preres, u64s,
- JOURNAL_RES_GET_NONBLOCK);
- if (ret != -EAGAIN)
- return ret;
-
bch2_trans_unlock(trans);
ret = bch2_journal_preres_get(&c->journal,
return 0;
}
-static int bch2_trans_journal_res_get(struct btree_trans *trans,
- unsigned flags)
+static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
int ret;
return BTREE_INSERT_OK;
}
-static int btree_trans_check_can_insert(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
-{
- struct btree_insert_entry *i;
- unsigned iter, u64s = 0;
- int ret;
-
- trans_for_each_update_sorted(trans, i, iter) {
- /* Multiple inserts might go to same leaf: */
- if (!same_leaf_as_prev(trans, iter))
- u64s = 0;
-
- u64s += i->k->k.u64s;
- ret = btree_key_can_insert(trans, i, &u64s);
- if (ret) {
- *stopped_at = i;
- return ret;
- }
- }
-
- return 0;
-}
-
static inline void do_btree_insert_one(struct btree_trans *trans,
struct btree_insert_entry *insert)
{
btree_insert_key_leaf(trans, insert);
}
-static inline bool update_triggers_transactional(struct btree_trans *trans,
- struct btree_insert_entry *i)
+static inline bool update_has_trans_triggers(struct btree_insert_entry *i)
{
- return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
- (i->iter->btree_id == BTREE_ID_EXTENTS ||
- i->iter->btree_id == BTREE_ID_INODES ||
- i->iter->btree_id == BTREE_ID_REFLINK);
+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->iter->btree_id);
}
-static inline bool update_has_triggers(struct btree_trans *trans,
- struct btree_insert_entry *i)
+static inline bool update_has_nontrans_triggers(struct btree_insert_entry *i)
{
- return likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- btree_node_type_needs_gc(i->iter->btree_id);
+ return (BTREE_NODE_TYPE_HAS_TRIGGERS &
+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
+ (1U << i->iter->btree_id);
}
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_btree_insert_at(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
+{
+ __bch2_btree_iter_unlock(iter);
+}
+
+static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage = NULL;
struct btree_insert_entry *i;
- struct btree_iter *iter;
unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
? BCH_BUCKET_MARK_BUCKET_INVALIDATE
: 0;
- int ret;
- trans_for_each_update(trans, i)
- BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
+ if (unlikely(trans->flags & BTREE_INSERT_NOMARK))
+ return;
- /*
- * note: running triggers will append more updates to the list of
- * updates as we're walking it:
- */
trans_for_each_update(trans, i)
- if (update_has_triggers(trans, i) &&
- update_triggers_transactional(trans, i)) {
- ret = bch2_trans_mark_update(trans, i->iter, i->k);
- if (ret == -EINTR)
- trace_trans_restart_mark(trans->ip);
- if (ret)
- goto out_clear_replicas;
- }
-
- trans_for_each_iter(trans, iter) {
- if (iter->nodes_locked != iter->nodes_intent_locked) {
- BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
- BUG_ON(trans->iters_live & (1ULL << iter->idx));
- __bch2_btree_iter_unlock(iter);
- }
- }
-
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
- trans_for_each_update(trans, i)
- btree_insert_entry_checks(trans, i);
- bch2_btree_trans_verify_locks(trans);
-
- /*
- * No more updates can be added - sort updates so we can take write
- * locks in the correct order:
- */
- btree_trans_sort_updates(trans);
+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+ bch2_mark_update(trans, i, NULL,
+ mark_flags|BCH_BUCKET_MARK_GC);
+}
- btree_trans_lock_write(trans, true);
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans,
+ struct btree_insert_entry **stopped_at)
+{
+ struct bch_fs *c = trans->c;
+ struct bch_fs_usage *fs_usage = NULL;
+ struct btree_insert_entry *i;
+ unsigned mark_flags = trans->flags & BTREE_INSERT_BUCKET_INVALIDATE
+ ? BCH_BUCKET_MARK_BUCKET_INVALIDATE
+ : 0;
+ unsigned iter, u64s = 0;
+ bool marking = false;
+ int ret;
if (race_fault()) {
- ret = -EINTR;
trace_trans_restart_fault_inject(trans->ip);
- goto out;
+ return -EINTR;
}
/*
* held, otherwise another thread could write the node changing the
* amount of space available:
*/
- ret = btree_trans_check_can_insert(trans, stopped_at);
- if (ret)
- goto out;
- trans_for_each_update(trans, i) {
- if (!btree_node_type_needs_gc(i->iter->btree_id))
- continue;
+ prefetch(&trans->c->journal.flags);
- if (!fs_usage) {
- percpu_down_read(&c->mark_lock);
- fs_usage = bch2_fs_usage_scratch_get(c);
- }
+ trans_for_each_update_sorted(trans, i, iter) {
+ /* Multiple inserts might go to same leaf: */
+ if (!same_leaf_as_prev(trans, iter))
+ u64s = 0;
- if (!bch2_bkey_replicas_marked_locked(c,
- bkey_i_to_s_c(i->k), true)) {
- ret = BTREE_INSERT_NEED_MARK_REPLICAS;
- goto out;
+ u64s += i->k->k.u64s;
+ ret = btree_key_can_insert(trans, i, &u64s);
+ if (ret) {
+ *stopped_at = i;
+ return ret;
}
+
+ if (btree_node_type_needs_gc(i->iter->btree_id))
+ marking = true;
+ }
+
+ if (marking) {
+ percpu_down_read(&c->mark_lock);
+ fs_usage = bch2_fs_usage_scratch_get(c);
}
/*
* succeed:
*/
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- trans->journal_u64s = 0;
-
- trans_for_each_update(trans, i)
- trans->journal_u64s += jset_u64s(i->k->k.u64s);
-
- ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK);
+ ret = bch2_trans_journal_res_get(trans,
+ JOURNAL_RES_GET_NONBLOCK);
if (ret)
- goto out;
+ goto err;
}
+ /*
+ * Not allowed to fail after we've gotten our journal reservation - we
+ * have to use it:
+ */
+
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_update(trans, i)
i->k->k.version = MAX_VERSION;
}
+ /* Must be called under mark_lock: */
+ if (marking && trans->fs_usage_deltas &&
+ bch2_replicas_delta_list_apply(c, fs_usage,
+ trans->fs_usage_deltas)) {
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
+ goto err;
+ }
+
trans_for_each_update(trans, i)
- if (update_has_triggers(trans, i) &&
- !update_triggers_transactional(trans, i))
+ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ update_has_nontrans_triggers(i))
bch2_mark_update(trans, i, fs_usage, mark_flags);
- if (fs_usage && trans->fs_usage_deltas)
- bch2_replicas_delta_list_apply(c, fs_usage,
- trans->fs_usage_deltas);
-
- if (fs_usage)
+ if (marking)
bch2_trans_fs_usage_apply(trans, fs_usage);
- if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
- unlikely(c->gc_pos.phase))
- trans_for_each_update(trans, i)
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
- bch2_mark_update(trans, i, NULL,
- mark_flags|
- BCH_BUCKET_MARK_GC);
+ if (unlikely(c->gc_pos.phase))
+ bch2_trans_mark_gc(trans);
trans_for_each_update(trans, i)
do_btree_insert_one(trans, i);
-out:
- BUG_ON(ret &&
- (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) &&
- trans->journal_res.ref);
-
- btree_trans_lock_write(trans, false);
-
- if (fs_usage) {
+err:
+ if (marking) {
bch2_fs_usage_scratch_put(c, fs_usage);
percpu_up_read(&c->mark_lock);
}
- bch2_journal_res_put(&c->journal, &trans->journal_res);
-out_clear_replicas:
- if (trans->fs_usage_deltas) {
- memset(&trans->fs_usage_deltas->fs_usage, 0,
- sizeof(trans->fs_usage_deltas->fs_usage));
- trans->fs_usage_deltas->used = 0;
+ return ret;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans,
+ struct btree_insert_entry **stopped_at)
+{
+ struct btree_insert_entry *i;
+ struct btree_iter *iter;
+ unsigned idx, u64s, journal_preres_u64s = 0;
+ int ret;
+
+ /*
+ * note: running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
+ trans_for_each_update(trans, i) {
+ /* we know trans->nounlock won't be set here: */
+ if (unlikely(!(i->iter->locks_want < 1
+ ? __bch2_btree_iter_upgrade(i->iter, 1)
+ : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
+ trace_trans_restart_upgrade(trans->ip);
+ return -EINTR;
+ }
+
+ if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) &&
+ update_has_trans_triggers(i)) {
+ ret = bch2_trans_mark_update(trans, i->iter, i->k);
+ if (unlikely(ret)) {
+ if (ret == -EINTR)
+ trace_trans_restart_mark(trans->ip);
+ return ret;
+ }
+ }
+
+ u64s = jset_u64s(i->k->k.u64s);
+ if (0)
+ journal_preres_u64s += u64s;
+ trans->journal_u64s += u64s;
}
- return ret;
+ ret = bch2_journal_preres_get(&trans->c->journal,
+ &trans->journal_preres, journal_preres_u64s,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (unlikely(ret == -EAGAIN))
+ ret = bch2_trans_journal_preres_get_cold(trans,
+ journal_preres_u64s);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * Can't be holding any read locks when we go to take write locks:
+ *
+ * note - this must be done after bch2_trans_journal_preres_get_cold()
+ * or anything else that might call bch2_trans_relock(), since that
+ * would just retake the read locks:
+ */
+ trans_for_each_iter_all(trans, iter) {
+ if (iter->nodes_locked != iter->nodes_intent_locked) {
+ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
+ EBUG_ON(trans->iters_live & (1ULL << iter->idx));
+ bch2_btree_iter_unlock_noinline(iter);
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
+ trans_for_each_update(trans, i)
+ btree_insert_entry_checks(trans, i);
+ bch2_btree_trans_verify_locks(trans);
+
+ /*
+ * No more updates can be added - sort updates so we can take write
+ * locks in the correct order:
+ */
+ btree_trans_sort_updates(trans);
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_btree_node_lock_for_insert(trans->c,
+ i->iter->l[0].b, i->iter);
+
+ ret = bch2_trans_commit_write_locked(trans, stopped_at);
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+ i->iter);
+
+ /*
+ * Drop journal reservation after dropping write locks, since dropping
+ * the journal reservation may kick off a journal write:
+ */
+ bch2_journal_res_put(&trans->c->journal, &trans->journal_res);
+
+ if (unlikely(ret))
+ return ret;
+
+ if (trans->flags & BTREE_INSERT_NOUNLOCK)
+ trans->nounlock = true;
+
+ trans_for_each_update_sorted(trans, i, idx)
+ if (!same_leaf_as_prev(trans, idx))
+ bch2_foreground_maybe_merge(trans->c, i->iter,
+ 0, trans->flags);
+
+ trans->nounlock = false;
+
+ trans_for_each_update(trans, i)
+ bch2_btree_iter_downgrade(i->iter);
+
+ return 0;
}
static noinline
return ret;
}
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- * if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
- */
-static int __bch2_trans_commit(struct btree_trans *trans,
- struct btree_insert_entry **stopped_at)
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- unsigned iter;
int ret;
- trans_for_each_update(trans, i) {
- if (!bch2_btree_iter_upgrade(i->iter, 1)) {
- trace_trans_restart_upgrade(trans->ip);
- ret = -EINTR;
- goto err;
- }
-
- ret = btree_iter_err(i->iter);
- if (ret)
- goto err;
- }
-
- ret = do_btree_insert_at(trans, stopped_at);
- if (unlikely(ret))
- goto err;
-
- if (trans->flags & BTREE_INSERT_NOUNLOCK)
- trans->nounlock = true;
-
- trans_for_each_update_sorted(trans, i, iter)
- if (!same_leaf_as_prev(trans, iter))
- bch2_foreground_maybe_merge(c, i->iter,
- 0, trans->flags);
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+ return -EROFS;
- trans->nounlock = false;
+ bch2_trans_unlock(trans);
- trans_for_each_update(trans, i)
- bch2_btree_iter_downgrade(i->iter);
-err:
- /* make sure we didn't drop or screw up locks: */
- bch2_btree_trans_verify_locks(trans);
+ ret = bch2_fs_read_write_early(c);
+ if (ret)
+ return ret;
- return ret;
+ percpu_ref_get(&c->writes);
+ return 0;
}
-int bch2_trans_commit(struct btree_trans *trans,
- struct disk_reservation *disk_res,
- u64 *journal_seq,
- unsigned flags)
+int __bch2_trans_commit(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_iter *iter;
unsigned orig_nr_updates = trans->nr_updates;
goto out_noupdates;
/* for the sake of sanity: */
- BUG_ON(trans->nr_updates > 1 && !(flags & BTREE_INSERT_ATOMIC));
-
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
+ EBUG_ON(trans->nr_updates > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
- if (!trans->commit_start)
- trans->commit_start = local_clock();
+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&trans->c->gc_lock);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
- trans->disk_res = disk_res;
- trans->journal_seq = journal_seq;
- trans->flags = flags;
-
- if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
- !percpu_ref_tryget(&c->writes))) {
- if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
- return -EROFS;
-
- bch2_trans_unlock(trans);
- ret = bch2_fs_read_write_early(c);
+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
+ unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+ ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
return ret;
+ }
+retry:
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ trans->journal_u64s = 0;
- percpu_ref_get(&c->writes);
+ ret = do_bch2_trans_commit(trans, &i);
- if (!bch2_trans_relock(trans)) {
- ret = -EINTR;
- goto err;
- }
+ if (trans->fs_usage_deltas) {
+ trans->fs_usage_deltas->used = 0;
+ memset(&trans->fs_usage_deltas->memset_start, 0,
+ (void *) &trans->fs_usage_deltas->memset_end -
+ (void *) &trans->fs_usage_deltas->memset_start);
}
-retry:
- ret = bch2_trans_journal_preres_get(trans);
- if (ret)
- goto err;
- ret = __bch2_trans_commit(trans, &i);
+ /* make sure we didn't drop or screw up locks: */
+ bch2_btree_trans_verify_locks(trans);
+
if (ret)
goto err;
out:
- bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
- if (unlikely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
- percpu_ref_put(&c->writes);
+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
+ percpu_ref_put(&trans->c->writes);
out_noupdates:
- if (!ret && trans->commit_start) {
- bch2_time_stats_update(&c->times[BCH_TIME_btree_update],
- trans->commit_start);
- trans->commit_start = 0;
- }
-
- BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+ EBUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
- trans_for_each_iter(trans, iter)
+ trans_for_each_iter_all(trans, iter)
iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
if (!ret) {
err:
ret = bch2_trans_commit_error(trans, i, ret);
- /* free updates and memory used by triggers, they'll be reexecuted: */
- trans->nr_updates = orig_nr_updates;
- trans->mem_top = orig_mem_top;
-
/* can't loop if it was passed in and we changed it: */
if (unlikely(trans->flags & BTREE_INSERT_NO_CLEAR_REPLICAS) && !ret)
ret = -EINTR;
+ if (ret)
+ goto out;
- if (!ret)
- goto retry;
-
- goto out;
+ /* free updates and memory used by triggers, they'll be reexecuted: */
+ trans->nr_updates = orig_nr_updates;
+ trans->mem_top = orig_mem_top;
+ goto retry;
}
/**
}
}
-static inline void update_replicas(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
- s64 sectors)
+static inline int update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
- BUG_ON(idx < 0);
+ if (idx < 0)
+ return -1;
+
+ if (!fs_usage)
+ return 0;
switch (r->data_type) {
case BCH_DATA_BTREE:
break;
}
fs_usage->replicas[idx] += sectors;
+ return 0;
}
static inline void update_cached_sectors(struct bch_fs *c,
update_replicas_list(trans, &r.e, sectors);
}
-void bch2_replicas_delta_list_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct replicas_delta_list *r)
+static inline struct replicas_delta *
+replicas_delta_next(struct replicas_delta *d)
+{
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
+}
+
+int bch2_replicas_delta_list_apply(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct replicas_delta_list *r)
{
struct replicas_delta *d = r->d;
struct replicas_delta *top = (void *) r->d + r->used;
+ unsigned i;
- acc_u64s((u64 *) fs_usage,
- (u64 *) &r->fs_usage, sizeof(*fs_usage) / sizeof(u64));
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ if (update_replicas(c, fs_usage, &d->r, d->delta)) {
+ top = d;
+ goto unwind;
+ }
- while (d != top) {
- BUG_ON((void *) d > (void *) top);
+ if (!fs_usage)
+ return 0;
- update_replicas(c, fs_usage, &d->r, d->delta);
+ fs_usage->nr_inodes += r->nr_inodes;
- d = (void *) d + replicas_entry_bytes(&d->r) + 8;
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ fs_usage->reserved += r->persistent_reserved[i];
+ fs_usage->persistent_reserved[i] += r->persistent_reserved[i];
}
+
+ return 0;
+unwind:
+ for (d = r->d; d != top; d = replicas_delta_next(d))
+ update_replicas(c, fs_usage, &d->r, -d->delta);
+ return -1;
}
#define do_mark_fn(fn, c, pos, flags, ...) \
if (ret < 0)
return ret;
- if (!ret) {
+ if (!ret && unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) {
/*
* During journal replay, and if gc repairs alloc info at
* runtime, the alloc info in the btree might not be up to date
d = replicas_deltas_realloc(trans, 0);
if (!(flags & BCH_BUCKET_MARK_OVERWRITE))
- d->fs_usage.nr_inodes++;
+ d->nr_inodes++;
else
- d->fs_usage.nr_inodes--;
+ d->nr_inodes--;
return 0;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
sectors *= replicas;
replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->fs_usage.persistent_reserved));
+ ARRAY_SIZE(d->persistent_reserved));
- d->fs_usage.reserved += sectors;
- d->fs_usage.persistent_reserved[replicas - 1] += sectors;
+ d->persistent_reserved[replicas - 1] += sectors;
return 0;
}
case KEY_TYPE_reflink_p:
int bch2_mark_update(struct btree_trans *, struct btree_insert_entry *,
struct bch_fs_usage *, unsigned);
-void bch2_replicas_delta_list_apply(struct bch_fs *,
- struct bch_fs_usage *,
- struct replicas_delta_list *);
+int bch2_replicas_delta_list_apply(struct bch_fs *,
+ struct bch_fs_usage *,
+ struct replicas_delta_list *);
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
unsigned, s64, unsigned);
int bch2_trans_mark_update(struct btree_trans *,
struct replicas_delta_list {
unsigned size;
unsigned used;
- struct bch_fs_usage fs_usage;
+
+ struct {} memset_start;
+ u64 nr_inodes;
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
+ struct {} memset_end;
struct replicas_delta d[0];
};
return true;
}
-static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
- struct bkey_i *src)
-{
- struct bkey_format *f = &b->format;
- struct bkey_i *dst_unpacked;
- struct bkey_packed tmp;
-
- if ((dst_unpacked = packed_to_bkey(dst)))
- dst_unpacked->k = src->k;
- else if (bch2_bkey_pack_key(&tmp, &src->k, f))
- memcpy_u64s(dst, &tmp, f->key_u64s);
- else
- return false;
-
- memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
- return true;
-}
-
-static bool bch2_extent_merge_inline(struct bch_fs *,
- struct btree_iter *,
- struct bkey_packed *,
- struct bkey_packed *,
- bool);
-
-static void verify_extent_nonoverlapping(struct bch_fs *c,
- struct btree *b,
- struct btree_node_iter *_iter,
- struct bkey_i *insert)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- struct btree_node_iter iter;
- struct bkey_packed *k;
- struct bkey uk;
-
- if (!expensive_debug_checks(c))
- return;
-
- iter = *_iter;
- k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
- iter = *_iter;
- k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
- BUG_ON(k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
- if (k &&
- (uk = bkey_unpack_key(b, k),
- bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
- char buf1[100];
- char buf2[100];
-
- bch2_bkey_to_text(&PBUF(buf1), &insert->k);
- bch2_bkey_to_text(&PBUF(buf2), &uk);
-
- bch2_dump_btree_node(b);
- panic("insert > next :\n"
- "insert %s\n"
- "next %s\n",
- buf1, buf2);
- }
-#endif
-
-#endif
-}
-
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
- struct bkey_i *insert)
-{
- struct btree_iter_level *l = &iter->l[0];
- struct btree_node_iter node_iter;
- struct bkey_packed *k;
-
- BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
- EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
- verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
-
- node_iter = l->iter;
- k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard);
- if (k && !bkey_written(l->b, k) &&
- bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
- return;
-
- node_iter = l->iter;
- k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard);
- if (k && !bkey_written(l->b, k) &&
- bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
- return;
-
- /*
- * may have skipped past some deleted extents greater than the insert
- * key, before we got to a non deleted extent and knew we could bail out
- * rewind the iterator a bit if necessary:
- */
- node_iter = l->iter;
- while ((k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
- bkey_cmp_left_packed(l->b, k, &insert->k.p) > 0)
- l->iter = node_iter;
-
- k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
- bch2_bset_insert(l->b, &l->iter, k, insert, 0);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
-}
-
static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
return BTREE_INSERT_OK;
}
+static void verify_extent_nonoverlapping(struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter *_iter,
+ struct bkey_i *insert)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct btree_node_iter iter;
+ struct bkey_packed *k;
+ struct bkey uk;
+
+ if (!expensive_debug_checks(c))
+ return;
+
+ iter = *_iter;
+ k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
+ BUG_ON(k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+ iter = *_iter;
+ k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
+#if 0
+ BUG_ON(k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+ if (k &&
+ (uk = bkey_unpack_key(b, k),
+ bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+ char buf1[100];
+ char buf2[100];
+
+ bch2_bkey_to_text(&PBUF(buf1), &insert->k);
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+ bch2_dump_btree_node(b);
+ panic("insert > next :\n"
+ "insert %s\n"
+ "next %s\n",
+ buf1, buf2);
+ }
+#endif
+
+#endif
+}
+
+static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
+ struct bkey_i *insert)
+{
+ struct btree_iter_level *l = &iter->l[0];
+ struct bkey_packed *k =
+ bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
+
+ BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
+
+ EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+ verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
+
+ if (debug_check_bkeys(c))
+ bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
+
+ bch2_bset_insert(l->b, &l->iter, k, insert, 0);
+ bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+}
+
static void
extent_squash(struct bch_fs *c, struct btree_iter *iter,
struct bkey_i *insert,
__bch2_cut_front(insert->k.p, k);
EBUG_ON(bkey_deleted(k.k));
extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
break;
case BCH_EXTENT_OVERLAP_BACK:
_k, u64s, 0);
} else {
extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
}
break;
__bch2_cut_front(insert->k.p, k);
BUG_ON(bkey_deleted(k.k));
extent_save(l->b, _k, k.k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
+ bch2_btree_iter_fix_key_modified(iter, l->b, _k);
extent_bset_insert(c, iter, &split.k);
break;
}
}
-struct extent_insert_state {
- struct bkey_i whiteout;
- bool update_journal;
- bool update_btree;
- bool deleting;
-};
-
-static void __bch2_insert_fixup_extent(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_i *insert,
- struct extent_insert_state *s)
+/**
+ * bch_extent_insert_fixup - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are inserted using
+ * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
+ * returns false, setting @iter->pos for the prefix of @insert that actually got
+ * inserted.
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k
+ * or bkey_start_offset(bkey_next(k)) >= k->offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ * k.size != 0 ∧ j.size != 0 →
+ * ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, the end of iter->pos indicates how much has been processed.
+ * If the end of iter->pos is not the same as the end of insert, then
+ * key insertion needs to continue/be retried.
+ */
+void bch2_insert_fixup_extent(struct btree_trans *trans,
+ struct btree_insert_entry *insert_entry)
{
+ struct bch_fs *c = trans->c;
+ struct btree_iter *iter = insert_entry->iter;
+ struct bkey_i *insert = insert_entry->k;
struct btree_iter_level *l = &iter->l[0];
+ struct btree_node_iter node_iter = l->iter;
+ bool deleting = bkey_whiteout(&insert->k);
+ bool update_journal = !deleting;
+ bool update_btree = !deleting;
+ struct bkey_i whiteout = *insert;
struct bkey_packed *_k;
struct bkey unpacked;
+ BKEY_PADDED(k) tmp;
+
+ EBUG_ON(iter->level);
+ EBUG_ON(!insert->k.size);
+ EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
KEY_TYPE_discard))) {
break;
if (!bkey_whiteout(k.k))
- s->update_journal = true;
+ update_journal = true;
- if (!s->update_journal) {
+ if (!update_journal) {
bch2_cut_front(cur_end, insert);
- bch2_cut_front(cur_end, &s->whiteout);
+ bch2_cut_front(cur_end, &whiteout);
bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
goto next;
}
* of the key we're deleting, instead of creating and inserting
* a new whiteout:
*/
- if (s->deleting &&
- !s->update_btree &&
+ if (deleting &&
+ !update_btree &&
!bkey_cmp(insert->k.p, k.k->p) &&
!bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
if (!bkey_whiteout(k.k)) {
btree_account_key_drop(l->b, _k);
_k->type = KEY_TYPE_discard;
reserve_whiteout(l->b, _k);
- bch2_btree_node_iter_fix(iter, l->b, &l->iter,
- _k, _k->u64s, _k->u64s);
+ bch2_btree_iter_fix_key_modified(iter,
+ l->b, _k);
}
break;
}
if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
insert->k.needs_whiteout = true;
- s->update_btree = true;
+ update_btree = true;
}
- if (s->update_btree &&
+ if (update_btree &&
overlap == BCH_EXTENT_OVERLAP_ALL &&
bkey_whiteout(k.k) &&
k.k->needs_whiteout) {
extent_squash(c, iter, insert, _k, k, overlap);
- if (!s->update_btree)
+ if (!update_btree)
bch2_cut_front(cur_end, insert);
next:
+ node_iter = l->iter;
+
if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
overlap == BCH_EXTENT_OVERLAP_MIDDLE)
break;
}
-}
-
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- * k.size != 0 ∧ j.size != 0 →
- * ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
- struct btree_insert_entry *insert)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter *iter = insert->iter;
- struct extent_insert_state s = {
- .whiteout = *insert->k,
- .update_journal = !bkey_whiteout(&insert->k->k),
- .update_btree = !bkey_whiteout(&insert->k->k),
- .deleting = bkey_whiteout(&insert->k->k),
- };
- BKEY_PADDED(k) tmp;
-
- EBUG_ON(iter->level);
- EBUG_ON(!insert->k->k.size);
- EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-
- __bch2_insert_fixup_extent(c, iter, insert->k, &s);
- bch2_btree_iter_set_pos_same_leaf(iter, insert->k->k.p);
+ l->iter = node_iter;
+ bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
- if (s.update_btree) {
- bkey_copy(&tmp.k, insert->k);
+ if (update_btree) {
+ bkey_copy(&tmp.k, insert);
- if (s.deleting)
+ if (deleting)
tmp.k.k.type = KEY_TYPE_discard;
EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
extent_bset_insert(c, iter, &tmp.k);
}
- if (s.update_journal) {
- bkey_copy(&tmp.k, !s.deleting ? insert->k : &s.whiteout);
+ if (update_journal) {
+ bkey_copy(&tmp.k, !deleting ? insert : &whiteout);
- if (s.deleting)
+ if (deleting)
tmp.k.k.type = KEY_TYPE_discard;
EBUG_ON(bkey_deleted(&tmp.k.k) || !tmp.k.k.size);
bch2_btree_journal_key(trans, iter, &tmp.k);
}
- bch2_cut_front(insert->k->k.p, insert->k);
+ bch2_cut_front(insert->k.p, insert);
}
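To make the nonzero-size invariant from the comment above concrete: extent keys are positioned at their end offset, and bkey_start_pos() is that position minus the size, so the check is plain offset arithmetic. A small worked example with made-up offsets:

	k: k.size = 8,  k.p.offset = 8    covers [0, 8)
	j: j.size = 8,  j.p.offset = 12   covers [4, 12)

	k.p > bkey_start_pos(j)   (8 > 4)
	k.p < j.p                 (8 < 12)

which is exactly the situation ¬(k > bkey_start_pos(j) ∧ k < j) forbids; roughly speaking, bch2_insert_fixup_extent() restores the invariant by trimming or squashing the existing overlapping key (via extent_squash()) before the new extent goes into the bset.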
const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
#undef set_common_fields
}
-static void bch2_extent_crc_append(struct bkey_i *k,
- struct bch_extent_crc_unpacked new)
+void bch2_extent_crc_append(struct bkey_i *k,
+ struct bch_extent_crc_unpacked new)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
union bch_extent_crc *crc = (void *) ptrs.end;
{
union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
- memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
- dst, (u64 *) end - (u64 *) dst);
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
+ dst, (u64 *) end - (u64 *) dst);
k->k.u64s += extent_entry_u64s(new);
memcpy(dst, new, extent_entry_bytes(new));
}
return BCH_MERGE_MERGE;
}
-/*
- * When merging an extent that we're inserting into a btree node, the new merged
- * extent could overlap with an existing 0 size extent - if we don't fix that,
- * it'll break the btree node iterator so this code finds those 0 size extents
- * and shifts them out of the way.
- *
- * Also unpacks and repacks.
- */
-static bool bch2_extent_merge_inline(struct bch_fs *c,
- struct btree_iter *iter,
- struct bkey_packed *l,
- struct bkey_packed *r,
- bool back_merge)
-{
- struct btree *b = iter->l[0].b;
- struct btree_node_iter *node_iter = &iter->l[0].iter;
- BKEY_PADDED(k) li, ri;
- struct bkey_packed *m = back_merge ? l : r;
- struct bkey_i *mi = back_merge ? &li.k : &ri.k;
- struct bset_tree *t = bch2_bkey_to_bset(b, m);
- enum merge_result ret;
-
- EBUG_ON(bkey_written(b, m));
-
- if (bkey_val_u64s(l) > BKEY_EXTENT_VAL_U64s_MAX ||
- bkey_val_u64s(r) > BKEY_EXTENT_VAL_U64s_MAX)
- return BCH_MERGE_NOMERGE;
-
- /*
- * We need to save copies of both l and r, because we might get a
- * partial merge (which modifies both) and then fails to repack
- */
- bch2_bkey_unpack(b, &li.k, l);
- bch2_bkey_unpack(b, &ri.k, r);
-
- ret = bch2_bkey_merge(c,
- bkey_i_to_s(&li.k),
- bkey_i_to_s(&ri.k));
- if (ret == BCH_MERGE_NOMERGE)
- return false;
-
- if (debug_check_bkeys(c))
- bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&li.k));
- if (debug_check_bkeys(c) &&
- ret == BCH_MERGE_PARTIAL)
- bch2_bkey_debugcheck(c, b, bkey_i_to_s_c(&ri.k));
-
- /*
- * check if we overlap with deleted extents - would break the sort
- * order:
- */
- if (back_merge) {
- struct bkey_packed *n = bkey_next(m);
-
- if (n != btree_bkey_last(b, t) &&
- bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
- bkey_deleted(n))
- return false;
- } else if (ret == BCH_MERGE_MERGE) {
- struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
-
- if (prev &&
- bkey_cmp_left_packed_byval(b, prev,
- bkey_start_pos(&li.k.k)) > 0)
- return false;
- }
-
- if (ret == BCH_MERGE_PARTIAL) {
- if (!extent_i_save(b, m, mi))
- return false;
-
- if (!back_merge)
- bkey_copy(packed_to_bkey(l), &li.k);
- else
- bkey_copy(packed_to_bkey(r), &ri.k);
- } else {
- if (!extent_i_save(b, m, &li.k))
- return false;
- }
-
- bch2_bset_fix_invalidated_key(b, m);
- bch2_btree_node_iter_fix(iter, b, node_iter,
- m, m->u64s, m->u64s);
-
- return ret == BCH_MERGE_MERGE;
-}
-
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
unsigned nr_replicas)
{
__bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
extent_entry_last(_e), _ptr, _entry)
+void bch2_extent_crc_append(struct bkey_i *,
+ struct bch_extent_crc_unpacked);
void bch2_extent_ptr_decoded_append(struct bkey_i *,
struct extent_ptr_decoded *);
};
struct dio_write {
- struct closure cl;
+ struct completion done;
struct kiocb *req;
struct mm_struct *mm;
unsigned loop:1,
unsigned sectors = sectors_to_reserve(&s->s[i],
res->disk.nr_replicas);
- BUG_ON(sectors > res->disk.sectors);
+ /*
+ * This can happen if we race with the error path in
+ * bch2_writepage_io_done():
+ */
+ sectors = min_t(unsigned, sectors, res->disk.sectors);
+
s->s[i].replicas_reserved += sectors;
res->disk.sectors -= sectors;
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
bio_full(&w->io->op.wbio.bio) ||
+ w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
/* O_DIRECT writes */
-static void bch2_dio_write_loop_async(struct closure *);
-
static long bch2_dio_write_loop(struct dio_write *dio)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
if (dio->loop)
goto loop;
- inode_dio_begin(&inode->v);
- __pagecache_block_get(&mapping->add_lock);
-
/* Write and invalidate pagecache range that we're writing to: */
offset = req->ki_pos + (dio->op.written << 9);
ret = write_invalidate_inode_pages_range(mapping,
task_io_account_write(bio->bi_iter.bi_size);
- closure_call(&dio->op.cl, bch2_write, NULL, &dio->cl);
-
if (!dio->sync && !dio->loop && dio->iter.count) {
struct iovec *iov = dio->inline_vecs;
iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
GFP_KERNEL);
if (unlikely(!iov)) {
- dio->op.error = -ENOMEM;
- goto err_wait_io;
+ dio->sync = true;
+ goto do_io;
}
dio->free_iov = true;
memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
dio->iter.iov = iov;
}
-err_wait_io:
+do_io:
dio->loop = true;
+ closure_call(&dio->op.cl, bch2_write, NULL, NULL);
- if (!dio->sync) {
- continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
+ if (dio->sync)
+ wait_for_completion(&dio->done);
+ else
return -EIOCBQUEUED;
- }
-
- closure_sync(&dio->cl);
loop:
i_sectors_acct(c, inode, &dio->quota_res,
dio->op.i_sectors_delta);
put_page(bv->bv_page);
if (!dio->iter.count || dio->op.error)
break;
+
bio_reset(bio);
+ reinit_completion(&dio->done);
}
ret = dio->op.error ?: ((long) dio->op.written << 9);
if (dio->free_iov)
kfree(dio->iter.iov);
- closure_debug_destroy(&dio->cl);
-
sync = dio->sync;
bio_put(bio);
return ret;
}
-static void bch2_dio_write_loop_async(struct closure *cl)
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
{
- struct dio_write *dio = container_of(cl, struct dio_write, cl);
+ struct dio_write *dio = container_of(op, struct dio_write, op);
- bch2_dio_write_loop(dio);
+ if (dio->sync)
+ complete(&dio->done);
+ else
+ bch2_dio_write_loop(dio);
}
-static int bch2_direct_IO_write(struct kiocb *req,
- struct iov_iter *iter,
- bool swap)
+static noinline
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
{
struct file *file = req->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct dio_write *dio;
struct bio *bio;
+ bool locked = true, extending;
ssize_t ret;
- lockdep_assert_held(&inode->v.i_rwsem);
+ prefetch(&c->opts);
+ prefetch((void *) &c->opts + 64);
+ prefetch(&inode->ei_inode);
+ prefetch((void *) &inode->ei_inode + 64);
- if (unlikely(!iter->count))
- return 0;
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(req, iter);
+ if (unlikely(ret <= 0))
+ goto err;
+
+ ret = file_remove_privs(file);
+ if (unlikely(ret))
+ goto err;
+
+ ret = file_update_time(file);
+ if (unlikely(ret))
+ goto err;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
- return -EINVAL;
+ goto err;
+
+ inode_dio_begin(&inode->v);
+ __pagecache_block_get(&mapping->add_lock);
+
+ extending = req->ki_pos + iter->count > inode->v.i_size;
+ if (!extending) {
+ inode_unlock(&inode->v);
+ locked = false;
+ }
bio = bio_alloc_bioset(GFP_KERNEL,
iov_iter_npages(iter, BIO_MAX_PAGES),
&c->dio_write_bioset);
dio = container_of(bio, struct dio_write, op.wbio.bio);
- closure_init(&dio->cl, NULL);
+ init_completion(&dio->done);
dio->req = req;
dio->mm = current->mm;
dio->loop = false;
- dio->sync = is_sync_kiocb(req) ||
- req->ki_pos + iter->count > inode->v.i_size;
+ dio->sync = is_sync_kiocb(req) || extending;
dio->free_iov = false;
dio->quota_res.sectors = 0;
dio->iter = *iter;
bch2_write_op_init(&dio->op, c, opts);
+ dio->op.end_io = bch2_dio_write_loop_async;
dio->op.target = opts.foreground_target;
op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
dio->op.write_point = writepoint_hashed((unsigned long) current);
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
iter->count >> 9, true);
if (unlikely(ret))
- goto err;
+ goto err_put_bio;
dio->op.nr_replicas = dio->op.opts.data_replicas;
req->ki_pos >> 9),
iter->count >> 9,
dio->op.opts.data_replicas))
- goto err;
+ goto err_put_bio;
- return bch2_dio_write_loop(dio);
+ ret = bch2_dio_write_loop(dio);
err:
+ if (locked)
+ inode_unlock(&inode->v);
+ if (ret > 0)
+ req->ki_pos += ret;
+ return ret;
+err_put_bio:
+ __pagecache_block_put(&mapping->add_lock);
bch2_disk_reservation_put(c, &dio->op.res);
bch2_quota_reservation_put(c, inode, &dio->quota_res);
- closure_debug_destroy(&dio->cl);
bio_put(bio);
- return ret;
+ inode_dio_end(&inode->v);
+ goto err;
}
ssize_t bch2_direct_IO(struct kiocb *req, struct iov_iter *iter)
struct blk_plug plug;
ssize_t ret;
+ if (iov_iter_rw(iter) == WRITE)
+ return -EINVAL;
+
blk_start_plug(&plug);
- ret = iov_iter_rw(iter) == WRITE
- ? bch2_direct_IO_write(req, iter, false)
- : bch2_direct_IO_read(req, iter);
+ ret = bch2_direct_IO_read(req, iter);
blk_finish_plug(&plug);
return ret;
}
-static ssize_t
-bch2_direct_write(struct kiocb *iocb, struct iov_iter *iter)
-{
- return bch2_direct_IO_write(iocb, iter, true);
-}
-
-static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
- ssize_t ret;
+ ssize_t ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return bch2_direct_write(iocb, from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(&inode->v);
+ inode_lock(&inode->v);
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto unlock;
+
ret = file_remove_privs(file);
if (ret)
- goto out;
+ goto unlock;
ret = file_update_time(file);
if (ret)
- goto out;
-
- ret = iocb->ki_flags & IOCB_DIRECT
- ? bch2_direct_write(iocb, from)
- : bch2_buffered_write(iocb, from);
+ goto unlock;
+ ret = bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;
-out:
+unlock:
+ inode_unlock(&inode->v);
current->backing_dev_info = NULL;
- return ret;
-}
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
- struct bch_inode_info *inode = file_bch_inode(iocb->ki_filp);
- bool direct = iocb->ki_flags & IOCB_DIRECT;
- ssize_t ret;
-
- inode_lock(&inode->v);
- ret = generic_write_checks(iocb, from);
if (ret > 0)
- ret = __bch2_write_iter(iocb, from);
- inode_unlock(&inode->v);
-
- if (ret > 0 && !direct)
ret = generic_write_sync(iocb, ret);
return ret;
loff_t offset, loff_t len)
{
struct bch_inode_info *inode = file_bch_inode(file);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
- if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
- return bchfs_fallocate(inode, mode, offset, len);
-
- if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
- return bchfs_fpunch(inode, offset, len);
+ if (!percpu_ref_tryget(&c->writes))
+ return -EROFS;
- if (mode == FALLOC_FL_INSERT_RANGE)
- return bchfs_fcollapse_finsert(inode, offset, len, true);
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
+ ret = bchfs_fallocate(inode, mode, offset, len);
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
+ ret = bchfs_fpunch(inode, offset, len);
+ else if (mode == FALLOC_FL_INSERT_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+ ret = -EOPNOTSUPP;
- if (mode == FALLOC_FL_COLLAPSE_RANGE)
- return bchfs_fcollapse_finsert(inode, offset, len, false);
+ percpu_ref_put(&c->writes);
- return -EOPNOTSUPP;
+ return ret;
}
static void mark_range_unallocated(struct bch_inode_info *inode,
static void hash_check_init(struct hash_check *h)
{
h->chain = NULL;
+ h->chain_end = 0;
}
static void hash_stop_chain(struct btree_trans *trans,
{
struct btree_iter *iter;
struct bkey_s_c k;
- int ret = -ENOENT;
+ int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
POS(inode_nr, 0), BTREE_ITER_SLOTS);
return PTR_ERR(iter);
k = bch2_btree_iter_peek_slot(iter);
- if (k.k->type == KEY_TYPE_inode)
- ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+ ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = k.k->type == KEY_TYPE_inode
+ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+ : -ENOENT;
bch2_trans_iter_put(trans, iter);
bch2_trans_update(trans, iter, k);
ret = bch2_trans_commit(trans, disk_res, journal_seq,
+ BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_ATOMIC|
BTREE_INSERT_USE_RESERVE);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
- closure_return(cl);
+ if (op->end_io)
+ op->end_io(op);
+ if (cl->parent)
+ closure_return(cl);
+ else
+ closure_debug_destroy(cl);
}
/**
if (parent)
bio_endio(&parent->bio);
- else
+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
closure_put(cl);
+ else
+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
}
static void init_append_extent(struct bch_write_op *op,
struct bch_extent_crc_unpacked crc)
{
struct bch_fs *c = op->c;
- struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top);
- struct extent_ptr_decoded p = { .crc = crc };
+ struct bkey_i_extent *e;
struct open_bucket *ob;
unsigned i;
+ BUG_ON(crc.compressed_size > wp->sectors_free);
+ wp->sectors_free -= crc.compressed_size;
op->pos.offset += crc.uncompressed_size;
+
+ e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
e->k.version = version;
- BUG_ON(crc.compressed_size > wp->sectors_free);
- wp->sectors_free -= crc.compressed_size;
+ if (crc.csum_type ||
+ crc.compression_type ||
+ crc.nonce)
+ bch2_extent_crc_append(&e->k_i, crc);
open_bucket_for_each(c, &wp->ptrs, ob, i) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+ union bch_extent_entry *end =
+ bkey_val_end(bkey_i_to_s(&e->k_i));
- p.ptr = ob->ptr;
- p.ptr.cached = !ca->mi.durability ||
+ end->ptr = ob->ptr;
+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+ end->ptr.cached = !ca->mi.durability ||
(op->flags & BCH_WRITE_CACHED) != 0;
- p.ptr.offset += ca->mi.bucket_size - ob->sectors_free;
- bch2_extent_ptr_decoded_append(&e->k_i, &p);
+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
+
+ e->k.u64s++;
BUG_ON(crc.compressed_size > ob->sectors_free);
ob->sectors_free -= crc.compressed_size;
return PREP_ENCODED_OK;
}
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+ struct bio **_dst)
{
struct bch_fs *c = op->c;
struct bio *src = &op->wbio.bio, *dst = src;
struct bvec_iter saved_iter;
- struct bkey_i *key_to_write;
void *ec_buf;
- unsigned key_to_write_offset = op->insert_keys.top_p -
- op->insert_keys.keys_p;
+ struct bpos ec_pos = op->pos;
unsigned total_output = 0, total_input = 0;
bool bounce = false;
bool page_alloc_failed = false;
case PREP_ENCODED_CHECKSUM_ERR:
goto csum_err;
case PREP_ENCODED_DO_WRITE:
+ /* XXX look for bug here */
if (ec_buf) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
dst->bi_iter.bi_size = total_output;
do_write:
/* might have done a realloc... */
+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- bch2_ec_add_backpointer(c, wp,
- bkey_start_pos(&key_to_write->k),
- total_input >> 9);
-
- dst->bi_end_io = bch2_write_endio;
- dst->bi_private = &op->cl;
- bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
-
- closure_get(dst->bi_private);
-
- bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER,
- key_to_write);
+ *_dst = dst;
return more;
csum_err:
bch_err(c, "error verifying existing checksum while "
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct write_point *wp;
+ struct bio *bio;
+ bool skip_put = true;
int ret;
again:
memset(&op->failed, 0, sizeof(op->failed));
do {
+ struct bkey_i *key_to_write;
+ unsigned key_to_write_offset = op->insert_keys.top_p -
+ op->insert_keys.keys_p;
+
/* +1 for possible cache device: */
if (op->open_buckets.nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets.v))
goto flush_io;
}
- ret = bch2_write_extent(op, wp);
-
bch2_open_bucket_get(c, wp, &op->open_buckets);
+ ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
if (ret < 0)
goto err;
+
+ if (ret)
+ skip_put = false;
+
+ bio->bi_end_io = bch2_write_endio;
+ bio->bi_private = &op->cl;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ if (!skip_put)
+ closure_get(bio->bi_private);
+ else
+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
+
+ key_to_write = (void *) (op->insert_keys.keys_p +
+ key_to_write_offset);
+
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ key_to_write);
} while (ret);
- continue_at(cl, bch2_write_index, index_update_wq(op));
+ if (!skip_put)
+ continue_at(cl, bch2_write_index, index_update_wq(op));
return;
err:
op->error = ret;
- continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
- ? bch2_write_index
- : bch2_write_done, index_update_wq(op));
+ continue_at(cl, bch2_write_index, index_update_wq(op));
return;
flush_io:
closure_sync(cl);
int ret;
flags &= ~BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_MUST_CLONE;
bch2_trans_init(&trans, c, 0, 0);
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 9),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
struct bch_io_opts opts)
{
op->c = c;
- op->io_wq = index_update_wq(op);
+ op->end_io = NULL;
op->flags = 0;
op->written = 0;
op->error = 0;
struct bch_write_op {
struct closure cl;
struct bch_fs *c;
- struct workqueue_struct *io_wq;
+ void (*end_io)(struct bch_write_op *);
u64 start_time;
unsigned written; /* sectors */
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
-
struct bch_io_opts opts;
struct bpos pos;
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _RET_IP_);
+ lock_release(&j->res_map, 0, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
- DEBUG_MEMORY_FREED(w->data, w->buf_size);
-
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
goto err;
}
bch_verbose(c, "alloc write done");
+
+ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
}
if (!c->sb.clean) {
u64 src_done, dst_done;
int ret = 0, ret2 = 0;
+ if (!percpu_ref_tryget(&c->writes))
+ return -EROFS;
+
if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
mutex_lock(&c->sb_lock);
if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
ret = bch2_trans_exit(&trans) ?: ret;
+ percpu_ref_put(&c->writes);
+
return dst_done ?: ret ?: ret2;
}
void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ u64 *d = dst;
+ const u64 *s = src;
+
+ while (u64s--)
+ *d++ = *s++;
+}
+
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
__memmove_u64s_down(dst, src, u64s);
}
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+ unsigned u64s)
+{
+ u64 *dst = (u64 *) _dst + u64s;
+ u64 *src = (u64 *) _src + u64s;
+
+ while (u64s--)
+ *--dst = *--src;
+}
+
+static inline void memmove_u64s_up_small(void *dst, const void *src,
+ unsigned u64s)
+{
+ EBUG_ON(dst < src);
+
+ __memmove_u64s_up_small(dst, src, u64s);
+}
+
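
The two helpers added above copy whole u64s one word at a time: memcpy_u64s_small() walks forward, which is only safe for overlapping regions when the destination sits at or below the source, while __memmove_u64s_up_small() walks backward from the end, which is only safe when the destination sits at or above the source. A small standalone C sketch of those two loops (local re-statements for illustration, not the bcachefs helpers themselves):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static void copy_u64s_down(uint64_t *dst, const uint64_t *src, unsigned u64s)
	{
		/* forward copy: safe for overlap when dst <= src */
		while (u64s--)
			*dst++ = *src++;
	}

	static void copy_u64s_up(uint64_t *dst, const uint64_t *src, unsigned u64s)
	{
		/* backward copy: safe for overlap when dst >= src */
		dst += u64s;
		src += u64s;
		while (u64s--)
			*--dst = *--src;
	}

	int main(void)
	{
		uint64_t buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

		/* shift buf[2..5] down two slots (dst < src): forward copy */
		copy_u64s_down(buf, buf + 2, 4);
		assert(buf[0] == 2 && buf[3] == 5);

		/* shift buf[0..3] up two slots (dst > src): backward copy */
		copy_u64s_up(buf + 2, buf, 4);
		assert(buf[2] == 2 && buf[5] == 5);

		printf("overlapping u64 copies behaved as expected\n");
		return 0;
	}

Copying in the direction that moves away from the overlapping region is what lets callers shift keys in place without a temporary buffer; the open-coded word loops simply avoid a memmove() call for the small sizes these paths see.
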
static inline void __memmove_u64s_up(void *_dst, const void *_src,
unsigned u64s)
{