-a27d7265e75f6d65c2b972ce4ac27abfc153c230
+e1f6739c4a9fee1db7d94a5087a253041542cb62
git rm -rf --ignore-unmatch libbcachefs
test -d libbcachefs || mkdir libbcachefs
cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/
+ git add libbcachefs/*.[ch]
cp $(LINUX_DIR)/include/trace/events/bcachefs.h include/trace/events/
+ git add include/trace/events/bcachefs.h
+ cp $(LINUX_DIR)/kernel/locking/six.c linux/
+ git add linux/six.c
+ cp $(LINUX_DIR)/include/linux/six.h include/linux/
+ git add include/linux/six.h
$(RM) libbcachefs/*.mod.c
git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
- git add libbcachefs/*.[ch] include/trace/events/bcachefs.h .bcachefs_revision
+ git add .bcachefs_revision
.PHONY: update-commit-bcachefs-sources
update-commit-bcachefs-sources: update-bcachefs-sources
struct task_struct;
# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
-# define lock_release(l, n, i) do { } while (0)
+# define lock_release(l, i) do { } while (0)
# define lock_set_class(l, n, k, s, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_set_current_reclaim_state(g) do { } while (0)
__entry->buckets_moved, __entry->buckets_not_moved)
);
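+
+/*
+ * transaction_restart_ip - logs the ip a btree transaction was started from
+ * (caller) and the ip that triggered the restart:
+ */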
+TRACE_EVENT(transaction_restart_ip,
+ TP_PROTO(unsigned long caller, unsigned long ip),
+ TP_ARGS(caller, ip),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, caller )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->caller = caller;
+ __entry->ip = ip;
+ ),
+
+ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip)
+);
+
DECLARE_EVENT_CLASS(transaction_restart,
TP_PROTO(unsigned long ip),
TP_ARGS(ip),
bch2_trans_update(trans, iter, &a->k_i,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|flags);
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ flags);
err:
if (ret == -EINTR)
goto retry;
set_current_state(TASK_INTERRUPTIBLE);
spin_lock(&c->freelist_lock);
- for (i = 0; i < RESERVE_NR; i++)
+ for (i = 0; i < RESERVE_NR; i++) {
+
+ /*
+ * Don't strand buckets on the copygc freelist until
+ * after recovery is finished:
+ */
+ if (!test_bit(BCH_FS_STARTED, &c->flags) &&
+ i == RESERVE_MOVINGGC)
+ continue;
+
if (fifo_push(&ca->free[i], bucket)) {
fifo_pop(&ca->free_inc, bucket);
spin_unlock(&c->freelist_lock);
goto out;
}
+ }
if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) {
ca->allocator_state = ALLOCATOR_BLOCKED_FULL;
#undef pr_fmt
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
-#include <linux/stddef.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
* inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
*/
-#include <linux/stddef.h>
#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
const struct bkey_ops *ops;
struct bkey uk;
struct bkey_s u;
-
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bkey_swab_key(f, k);
-
- if (version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
-
- if (version < bcachefs_metadata_version_inode_btree_change &&
- btree_id == BTREE_ID_INODES) {
+ int i;
+
+ /*
+ * Do these operations in reverse order in the write path:
+ */
+
+ for (i = 0; i < 4; i++)
+ switch (!write ? i : 3 - i) {
+ case 0:
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_key(f, k);
+ break;
+ case 1:
+ if (version < bcachefs_metadata_version_bkey_renumber)
+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+ break;
+ case 2:
+ if (version < bcachefs_metadata_version_inode_btree_change &&
+ btree_id == BTREE_ID_INODES) {
+ if (!bkey_packed(k)) {
+ struct bkey_i *u = packed_to_bkey(k);
+ swap(u->k.p.inode, u->k.p.offset);
+ } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+ f->bits_per_field[BKEY_FIELD_OFFSET]) {
+ struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+ swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+ tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+ swap(tmp.field_offset[BKEY_FIELD_INODE],
+ tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+ if (!write)
+ swap(in, out);
+
+ uk = __bch2_bkey_unpack_key(in, k);
+ swap(uk.p.inode, uk.p.offset);
+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ }
+ }
+ break;
+ case 3:
if (!bkey_packed(k)) {
- struct bkey_i *u = packed_to_bkey(k);
- swap(u->k.p.inode, u->k.p.offset);
- } else if (f->bits_per_field[BKEY_FIELD_INODE] &&
- f->bits_per_field[BKEY_FIELD_OFFSET]) {
- struct bkey_format tmp = *f, *in = f, *out = &tmp;
-
- swap(tmp.bits_per_field[BKEY_FIELD_INODE],
- tmp.bits_per_field[BKEY_FIELD_OFFSET]);
- swap(tmp.field_offset[BKEY_FIELD_INODE],
- tmp.field_offset[BKEY_FIELD_OFFSET]);
-
- if (!write)
- swap(in, out);
-
- uk = __bch2_bkey_unpack_key(in, k);
- swap(uk.p.inode, uk.p.offset);
- BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+ u = bkey_i_to_s(packed_to_bkey(k));
+ } else {
+ uk = __bch2_bkey_unpack_key(f, k);
+ u.k = &uk;
+ u.v = bkeyp_val(f, k);
}
- }
- if (!bkey_packed(k)) {
- u = bkey_i_to_s(packed_to_bkey(k));
- } else {
- uk = __bch2_bkey_unpack_key(f, k);
- u.k = &uk;
- u.v = bkeyp_val(f, k);
- }
+ if (big_endian != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
- if (big_endian != CPU_BIG_ENDIAN)
- bch2_bkey_swab_val(u);
+ ops = &bch2_bkey_ops[k->type];
- ops = &bch2_bkey_ops[k->type];
-
- if (ops->compat)
- ops->compat(btree_id, version, big_endian, write, u);
+ if (ops->compat)
+ ops->compat(btree_id, version, big_endian, write, u);
+ break;
+ default:
+ BUG();
+ }
}
return c->opts.btree_node_size >> c->block_bits;
}
-#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4)
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
sizeof(u64), GFP_KERNEL);
- if (!c->usage_gc)
+ if (!c->usage_gc) {
+ bch_err(c, "error allocating c->usage_gc");
return -ENOMEM;
+ }
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets[1]) {
percpu_ref_put(&ca->ref);
+ bch_err(c, "error allocating ca->buckets[gc]");
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
+ bch_err(c, "error allocating ca->usage[gc]");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
}
ret = bch2_ec_mem_alloc(c, true);
- if (ret)
+ if (ret) {
+ bch_err(c, "error allocating ec gc mem");
return ret;
+ }
percpu_down_write(&c->mark_lock);
return;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
struct btree_node *bn =
container_of(i, struct btree_node, keys);
/* These indicate that we read the wrong btree node: */
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ /* XXX endianness */
+ btree_err_on(bp->seq != bn->keys.seq,
+ BTREE_ERR_MUST_RETRY, c, b, NULL,
+ "incorrect sequence number (wrong btree node)");
+ }
+
btree_err_on(BTREE_NODE_ID(bn) != b->btree_id,
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect btree id");
* reflect that those writes were done and the data flushed from the
* journal:
*
+ * Also on journal error, the pending write may have updates that were
+ * never journalled (interior nodes, see btree_update_nodes_written()) -
+ * it's critical that we don't do the write in that case otherwise we
+ * will have updates visible that weren't in the journal:
+ *
* Make sure to update b->written so bch2_btree_init_next() doesn't
* break:
*/
struct btree_iter *linked;
trans_for_each_iter(iter->trans, linked)
- if (linked != iter &&
- linked->l[level].b == b &&
+ if (linked->l[level].b == b &&
btree_node_locked_type(linked, level) >= want) {
six_lock_increment(&b->lock, want);
return true;
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
- BUG_ON((as->nr_new_nodes || as->nr_pending) &&
- !bch2_journal_error(&c->journal));;
+ BUG_ON(as->nr_new_nodes || as->nr_pending);
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
+ list_del(&as->unwritten_list);
list_del(&as->list);
closure_debug_destroy(&as->cl);
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
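+/* Take a six lock in intent then write state; drop intent again if write fails: */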
+static inline bool six_trylock_intentwrite(struct six_lock *lock)
{
- struct bch_fs *c = as->c;
-
- while (as->nr_new_nodes) {
- struct btree *b = as->new_nodes[--as->nr_new_nodes];
+ if (!six_trylock_intent(lock))
+ return false;
- BUG_ON(b->will_make_reachable != (unsigned long) as);
- b->will_make_reachable = 0;
-
- /*
- * b->will_make_reachable prevented it from being written, so
- * write it now if it needs to be written:
- */
- btree_node_lock_type(c, b, SIX_LOCK_read);
- bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
- six_unlock_read(&b->lock);
+ if (!six_trylock_write(lock)) {
+ six_unlock_intent(lock);
+ return false;
}
- while (as->nr_pending)
- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
- seq);
+ return true;
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct btree *nodes_need_write[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES + 1];
+ unsigned nr_nodes_need_write;
struct journal_res res = { 0 };
struct bch_fs *c = as->c;
+ struct btree_root *r;
struct btree *b;
- struct bset *i;
int ret;
/*
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
again:
+ nr_nodes_need_write = 0;
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
struct btree_update, unwritten_list);
if (!as || !as->nodes_written) {
}
b = as->b;
- if (b && !six_trylock_intent(&b->lock)) {
+ if (b && !six_trylock_intentwrite(&b->lock)) {
mutex_unlock(&c->btree_interior_update_lock);
+
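+		/*
+		 * Wait until we can take both the intent and write locks,
+		 * then retry - things may have changed while we weren't
+		 * holding btree_interior_update_lock:
+		 */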
btree_node_lock_type(c, b, SIX_LOCK_intent);
+ six_lock_write(&b->lock);
+
+ six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
+
mutex_lock(&c->btree_interior_update_lock);
goto again;
}
- list_del(&as->unwritten_list);
-
ret = bch2_journal_res_get(&c->journal, &res, as->journal_u64s,
+ JOURNAL_RES_GET_NONBLOCK|
JOURNAL_RES_GET_RESERVED);
- if (ret) {
- BUG_ON(!bch2_journal_error(&c->journal));
- /* can't unblock btree writes */
- goto free_update;
+ if (ret == -EAGAIN) {
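+		/*
+		 * Couldn't get a journal reservation without blocking: drop
+		 * our locks, wait for space in the journal, then retry from
+		 * the top:
+		 */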
+ unsigned u64s = as->journal_u64s;
+
+ if (b) {
+ six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
+ }
+
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ ret = bch2_journal_res_get(&c->journal, &res, u64s,
+ JOURNAL_RES_GET_CHECK|
+ JOURNAL_RES_GET_RESERVED);
+ if (!ret) {
+ mutex_lock(&c->btree_interior_update_lock);
+ goto again;
+ }
}
- {
+ if (!ret) {
struct journal_buf *buf = &c->journal.buf[res.idx];
struct jset_entry *entry = vstruct_idx(buf->data, res.offset);
res.offset += as->journal_u64s;
res.u64s -= as->journal_u64s;
memcpy_u64s(entry, as->journal_entries, as->journal_u64s);
+ } else {
+ /*
+ * On journal error we have to run most of the normal path so
+ * that shutdown works - unblocking btree node writes in
+ * particular and writing them if needed - except for
+ * journalling the update:
+ */
+
+ BUG_ON(!bch2_journal_error(&c->journal));
}
switch (as->mode) {
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
/* @b is the node we did the final insert into: */
- BUG_ON(!res.ref);
- six_lock_write(&b->lock);
+ /*
+ * On failure to get a journal reservation, we still have to
+ * unblock the write and allow most of the write path to happen
+ * so that shutdown works, but the i->journal_seq mechanism
+ * won't work to prevent the btree write from being visible (we
+ * didn't get a journal sequence number) - instead
+ * __bch2_btree_node_write() doesn't do the actual write if
+ * we're in journal error state:
+ */
+
list_del(&as->write_blocked_list);
- i = btree_bset_last(b);
- i->journal_seq = cpu_to_le64(
- max(res.seq,
- le64_to_cpu(i->journal_seq)));
+ if (!ret) {
+ struct bset *i = btree_bset_last(b);
+
+ i->journal_seq = cpu_to_le64(
+ max(res.seq,
+ le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, res.seq);
+ }
+
+ nodes_need_write[nr_nodes_need_write++] = b;
- bch2_btree_add_journal_pin(c, b, res.seq);
six_unlock_write(&b->lock);
+ six_unlock_intent(&b->lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
BUG_ON(b);
break;
- case BTREE_INTERIOR_UPDATING_ROOT: {
- struct btree_root *r = &c->btree_roots[as->btree_id];
+ case BTREE_INTERIOR_UPDATING_ROOT:
+ r = &c->btree_roots[as->btree_id];
BUG_ON(b);
mutex_unlock(&c->btree_root_lock);
break;
}
- }
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_res_put(&c->journal, &res);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
-free_update:
- /* Do btree write after dropping journal res: */
- if (b) {
- /*
- * b->write_blocked prevented it from being written, so
- * write it now if it needs to be written:
- */
- btree_node_write_if_need(c, b, SIX_LOCK_intent);
- six_unlock_intent(&b->lock);
+
+ while (as->nr_new_nodes) {
+ b = as->new_nodes[--as->nr_new_nodes];
+
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
+ b->will_make_reachable = 0;
+
+ nodes_need_write[nr_nodes_need_write++] = b;
}
- if (!ret)
- btree_update_nodes_reachable(as, res.seq);
+ while (as->nr_pending)
+ bch2_btree_node_free_ondisk(c,
+ &as->pending[--as->nr_pending], res.seq);
__bch2_btree_update_free(as);
/*
* nodes to be writeable:
*/
closure_wake_up(&c->btree_interior_update_wait);
+
+ /*
+ * Can't take btree node locks while holding btree_interior_update_lock:
+	 */
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ /* Do btree writes after dropping journal res/locks: */
+ while (nr_nodes_need_write) {
+ b = nodes_need_write[--nr_nodes_need_write];
+
+ btree_node_lock_type(c, b, SIX_LOCK_read);
+ bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
+ six_unlock_read(&b->lock);
+ }
+
+ mutex_lock(&c->btree_interior_update_lock);
goto again;
}
}
struct btree_update *
-bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
+bch2_btree_update_start(struct btree_trans *trans, enum btree_id id,
unsigned nr_nodes, unsigned flags,
struct closure *cl)
{
+ struct bch_fs *c = trans->c;
+ struct journal_preres journal_preres = { 0 };
struct btree_reserve *reserve;
struct btree_update *as;
int ret;
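+
+	/*
+	 * Try the journal prereservation without blocking first; if that
+	 * fails we have to drop btree locks before waiting for it:
+	 */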
+ ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ BTREE_UPDATE_JOURNAL_RES,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (ret == -EAGAIN) {
+ if (flags & BTREE_INSERT_NOUNLOCK)
+ return ERR_PTR(-EINTR);
+
+ bch2_trans_unlock(trans);
+
+ ret = bch2_journal_preres_get(&c->journal, &journal_preres,
+ BTREE_UPDATE_JOURNAL_RES, 0);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (!bch2_trans_relock(trans)) {
+ bch2_journal_preres_put(&c->journal, &journal_preres);
+ return ERR_PTR(-EINTR);
+ }
+ }
+
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
- if (IS_ERR(reserve))
+ if (IS_ERR(reserve)) {
+ bch2_journal_preres_put(&c->journal, &journal_preres);
return ERR_CAST(reserve);
+ }
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
memset(as, 0, sizeof(*as));
as->btree_id = id;
as->reserve = reserve;
INIT_LIST_HEAD(&as->write_blocked_list);
+ INIT_LIST_HEAD(&as->unwritten_list);
+ as->journal_preres = journal_preres;
bch2_keylist_init(&as->parent_keys, as->inline_keys);
- ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
- ARRAY_SIZE(as->journal_entries), 0);
- if (ret) {
- bch2_btree_reserve_put(c, reserve);
- closure_debug_destroy(&as->cl);
- mempool_free(as, &c->btree_interior_update_pool);
- return ERR_PTR(ret);
- }
-
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
/* Hack, because gc and splitting nodes doesn't mix yet: */
if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
+ if (flags & BTREE_INSERT_NOUNLOCK) {
+ trace_transaction_restart_ip(trans->ip, _THIS_IP_);
return -EINTR;
+ }
bch2_trans_unlock(trans);
down_read(&c->gc_lock);
goto out;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, b), flags,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
if (IS_ERR(as)) {
BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
bch2_trans_unlock(trans);
ret = -EINTR;
+
+ trace_transaction_restart_ip(trans->ip, _THIS_IP_);
}
goto out;
}
goto err_unlock;
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(trans, iter->btree_id,
btree_update_reserve_required(c, parent) + 1,
+ flags|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
!(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
struct btree *n, *parent = btree_node_parent(iter, b);
struct btree_update *as;
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
(parent
? btree_update_reserve_required(c, parent)
: 0) + 1,
new_hash = bch2_btree_node_mem_alloc(c);
}
- as = bch2_btree_update_start(c, iter->btree_id,
+ as = bch2_btree_update_start(iter->trans, iter->btree_id,
parent ? btree_update_reserve_required(c, parent) : 0,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
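+
+/*
+ * Upper bound, in u64s, on the journal entries an interior node update writes
+ * - sizes the journal_entries buffer in struct btree_update and the journal
+ * prereservation it takes:
+ */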
+#define BTREE_UPDATE_JOURNAL_RES \
+ ((BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2)
+
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
unsigned nr_new_nodes;
unsigned journal_u64s;
- u64 journal_entries[
- (BKEY_BTREE_PTR_U64s_MAX + 1) * (BTREE_MAX_DEPTH - 1) * 2];
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
-bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
+bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
+#include <crypto/skcipher.h>
#include <keys/user-type.h>
static u64 bch2_checksum_init(unsigned type)
BUG();
}
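+
+/*
+ * Returns true if the bio's data is contiguous in memory, i.e. each bvec
+ * starts where the previous one ended:
+ */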
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ void *expected_start = NULL;
+
+ __bio_for_each_bvec(bv, bio, iter, start) {
+ if (expected_start &&
+ expected_start != page_address(bv.bv_page) + bv.bv_offset)
+ return false;
+
+ expected_start = page_address(bv.bv_page) +
+ bv.bv_offset + bv.bv_len;
+ }
+
+ return true;
+}
+
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bvec_iter start, int rw)
{
unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
- bool first = true;
- unsigned prev_end = PAGE_SIZE;
void *data;
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
-#ifndef CONFIG_HIGHMEM
- __bio_for_each_bvec(bv, bio, iter, start) {
- if (bv.bv_len == start.bi_size)
- return (struct bbuf) {
- .b = page_address(bv.bv_page) + bv.bv_offset,
- .type = BB_NONE, .rw = rw
- };
- }
-#endif
+ if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+ bio_phys_contig(bio, start))
+ return (struct bbuf) {
+ .b = page_address(bio_iter_page(bio, start)) +
+ bio_iter_offset(bio, start),
+ .type = BB_NONE, .rw = rw
+ };
+
+ /* check if we can map the pages contiguously: */
__bio_for_each_segment(bv, bio, iter, start) {
- if ((!first && bv.bv_offset) ||
- prev_end != PAGE_SIZE)
+ if (iter.bi_size != start.bi_size &&
+ bv.bv_offset)
+ goto bounce;
+
+ if (bv.bv_len < iter.bi_size &&
+ bv.bv_offset + bv.bv_len < PAGE_SIZE)
goto bounce;
- prev_end = bv.bv_offset + bv.bv_len;
nr_pages++;
}
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
- size_t len;
+ size_t real_src_len = le32_to_cpup(src_data.b);
+
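+		/* the on disk format prefixes zstd data with a 4 byte length header: */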
+ if (real_src_len > src_len - 4)
+ goto err;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
- src_len = le32_to_cpup(src_data.b);
-
- len = ZSTD_decompressDCtx(ctx,
+ ret = ZSTD_decompressDCtx(ctx,
dst_data, dst_len,
- src_data.b + 4, src_len);
+ src_data.b + 4, real_src_len);
mempool_free(workspace, &c->decompress_workspace);
- if (len != dst_len)
+ if (ret != dst_len)
goto err;
break;
}
if (ret)
goto err;
- if (dst_data.type != BB_NONE)
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
memset(dst_data.b + *dst_len, 0, pad);
*dst_len += pad;
- if (dst_data.type != BB_NONE)
+ if (dst_data.type != BB_NONE &&
+ dst_data.type != BB_VMAP)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
- size_t order = get_order(max_extent);
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
- 1, order);
+ 1, max_extent);
if (ret)
goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
- 1, order);
+ 1, max_extent);
if (ret)
goto out;
}
unsigned offset,
struct bpos *end,
unsigned *nr_iters,
- unsigned max_iters,
- bool overwrite)
+ unsigned max_iters)
{
- int ret = 0;
+ int ret = 0, ret2 = 0;
- /*
- * The extent update path requires an _additional_ iterator for each
- * extent we're inserting and overwriting:
- */
- *nr_iters += 1;
if (*nr_iters >= max_iters) {
*end = bpos_min(*end, k.k->p);
ret = 1;
for_each_btree_key(trans, iter,
BTREE_ID_REFLINK, POS(0, idx + offset),
- BTREE_ITER_SLOTS, r_k, ret) {
+ BTREE_ITER_SLOTS, r_k, ret2) {
if (bkey_cmp(bkey_start_pos(r_k.k),
POS(0, idx + sectors)) >= 0)
break;
+ /* extent_update_to_keys(), for the reflink_v update */
+ *nr_iters += 1;
+
*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
if (*nr_iters >= max_iters) {
struct bpos pos = bkey_start_pos(k.k);
- pos.offset += r_k.k->p.offset - idx;
+ pos.offset += min_t(u64, k.k->size,
+ r_k.k->p.offset - idx);
*end = bpos_min(*end, pos);
ret = 1;
}
}
- return ret;
+ return ret2 ?: ret;
}
#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
*end = bpos_min(insert->k.p, b->key.k.p);
+ /* extent_update_to_keys(): */
+ nr_iters += 1;
+
ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
- &nr_iters, EXTENT_ITERS_MAX / 2, false);
+ &nr_iters, EXTENT_ITERS_MAX / 2);
if (ret < 0)
return ret;
offset = bkey_start_offset(&insert->k) -
bkey_start_offset(k.k);
+ /* extent_handle_overwrites(): */
+ switch (bch2_extent_overlap(&insert->k, k.k)) {
+ case BCH_EXTENT_OVERLAP_ALL:
+ case BCH_EXTENT_OVERLAP_FRONT:
+ nr_iters += 1;
+ break;
+ case BCH_EXTENT_OVERLAP_BACK:
+ case BCH_EXTENT_OVERLAP_MIDDLE:
+ nr_iters += 2;
+ break;
+ }
+
ret = count_iters_for_insert(trans, k, offset, end,
- &nr_iters, EXTENT_ITERS_MAX, true);
+ &nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
return;
bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
- !bch2_bkey_replicas_marked(c, k, false), c,
+ !bch2_bkey_replicas_marked_locked(c, k, false), c,
"btree key bad (replicas not marked in superblock):\n%s",
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
bio_full(&w->io->op.wbio.bio, PAGE_SIZE) ||
- w->io->op.wbio.bio.bi_iter.bi_size >= (256U << 20) ||
+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
+ (BIO_MAX_PAGES * PAGE_SIZE) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);
goto loop;
while (1) {
+ size_t extra = dio->iter.count -
+ min(BIO_MAX_PAGES * PAGE_SIZE, dio->iter.count);
+
if (kthread)
use_mm(dio->mm);
BUG_ON(current->faults_disabled_mapping);
current->faults_disabled_mapping = mapping;
+ /*
+ * Don't issue more than 2MB at once, the bcachefs io path in
+ * io.c can't bounce more than that:
+ */
+
+ dio->iter.count -= extra;
ret = bio_iov_iter_get_pages(bio, &dio->iter);
+ dio->iter.count += extra;
current->faults_disabled_mapping = NULL;
if (kthread)
__bch2_write_index(op);
- if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
+ if (!(op->flags & BCH_WRITE_DONE)) {
+ continue_at(cl, __bch2_write, index_update_wq(op));
+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
*op_journal_seq(op),
cl);
if (ret < 0)
goto err;
- if (ret)
+ if (ret) {
skip_put = false;
+ } else {
+ /*
+ * for the skip_put optimization this has to be set
+ * before we submit the bio:
+ */
+ op->flags |= BCH_WRITE_DONE;
+ }
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
return;
err:
op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
return;
flush_io:
+ /*
+ * If the write can't all be submitted at once, we generally want to
+ * block synchronously as that signals backpressure to the caller.
+ *
+ * However, if we're running out of a workqueue, we can't block here
+ * because we'll be blocking other work items from completing:
+ */
+ if (current->flags & PF_WQ_WORKER) {
+ continue_at(cl, bch2_write_index, index_update_wq(op));
+ return;
+ }
+
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error) {
+ op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_done, NULL);
return;
}
bch2_keylist_push(&op->insert_keys);
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+ op->flags |= BCH_WRITE_DONE;
+
continue_at_nobarrier(cl, bch2_write_index, NULL);
return;
err:
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 10),
BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 11),
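+	/*
+	 * Set when the write path is done submitting data; if it's not set,
+	 * bch2_write_index loops back into __bch2_write:
+	 */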
+ BCH_WRITE_DONE = (1 << 12),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
if (!res->ref)
return;
- lock_release(&j->res_map, 0, _THIS_IP_);
+ lock_release(&j->res_map, _THIS_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
return ret;
out:
if (!(flags & JOURNAL_RES_GET_CHECK)) {
- lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
+ lock_acquire_shared(&j->res_map, 0,
+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+ NULL, _THIS_IP_);
EBUG_ON(!res->ref);
}
return 0;
spin_unlock(&j->lock);
}
-void __bch2_journal_pin_add(struct journal *j, u64 seq,
+static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
- spin_lock(&j->lock);
-
__journal_pin_drop(j, pin);
BUG_ON(!atomic_read(&pin_list->count));
pin->flush = flush_fn;
list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
+}
+void __bch2_journal_pin_add(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ spin_lock(&j->lock);
+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
spin_unlock(&j->lock);
/*
struct journal_entry_pin *src,
journal_pin_flush_fn flush_fn)
{
+ spin_lock(&j->lock);
+
if (journal_pin_active(src) &&
(!journal_pin_active(dst) || src->seq < dst->seq))
- __bch2_journal_pin_add(j, src->seq, dst, flush_fn);
+ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
+
+ spin_unlock(&j->lock);
}
/**
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k;
struct bkey_i *insert;
- struct bkey_i_extent *new =
- bkey_i_to_extent(bch2_keylist_front(keys));
+ struct bkey_i_extent *new;
BKEY_PADDED(k) _new, _insert;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool did_work = false;
int nr;
+ bch2_trans_reset(&trans, 0);
+
+ k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
- if (ret)
+ if (ret) {
+ if (ret == -EINTR)
+ continue;
break;
+ }
+
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
if (bversion_cmp(k.k->version, new->k.version) ||
!bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
return ret;
}
-static int bch2_set_quota(struct super_block *sb, struct kqid qid,
- struct qc_dqblk *qdq)
+static int bch2_set_quota_trans(struct btree_trans *trans,
+ struct bkey_i_quota *new_quota,
+ struct qc_dqblk *qdq)
{
- struct bch_fs *c = sb->s_fs_info;
- struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
- struct bkey_i_quota new_quota;
int ret;
- if (sb->s_flags & SB_RDONLY)
- return -EROFS;
-
- bkey_quota_init(&new_quota.k_i);
- new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
-
- bch2_trans_init(&trans, c, 0, 0);
-
- iter = bch2_trans_get_iter(&trans, BTREE_ID_QUOTAS, new_quota.k.p,
+ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(iter);
if (unlikely(ret))
return ret;
- switch (k.k->type) {
- case KEY_TYPE_quota:
- new_quota.v = *bkey_s_c_to_quota(k).v;
- break;
- }
+ if (k.k->type == KEY_TYPE_quota)
+ new_quota->v = *bkey_s_c_to_quota(k).v;
if (qdq->d_fieldmask & QC_SPC_SOFT)
- new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
if (qdq->d_fieldmask & QC_SPC_HARD)
- new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
if (qdq->d_fieldmask & QC_INO_SOFT)
- new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
if (qdq->d_fieldmask & QC_INO_HARD)
- new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+ return bch2_trans_update(trans, iter, &new_quota->k_i, 0);
+}
- bch2_trans_update(&trans, iter, &new_quota.k_i, 0);
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+ struct qc_dqblk *qdq)
+{
+ struct bch_fs *c = sb->s_fs_info;
+ struct btree_trans trans;
+ struct bkey_i_quota new_quota;
+ int ret;
- ret = bch2_trans_commit(&trans, NULL, NULL, 0);
+ if (sb->s_flags & SB_RDONLY)
+ return -EROFS;
- bch2_trans_exit(&trans);
+ bkey_quota_init(&new_quota.k_i);
+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- if (ret)
- return ret;
+ bch2_trans_init(&trans, c, 0, 0);
- ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
+ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+
+ bch2_trans_exit(&trans);
return ret;
}
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
- r->state = REBALANCE_THROTTLED;
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
- r->throttled_until_cputime = start + throttle;
- bch2_kthread_io_clock_wait(clock,
- r->throttled_until_iotime,
- throttle);
- continue;
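+			/*
+			 * Don't sleep unless the wakeup time is more than the
+			 * io clock's max slop in the future:
+			 */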
+ if (atomic_long_read(&clock->now) + clock->max_slop <
+ r->throttled_until_iotime) {
+ r->throttled_until_cputime = start + throttle;
+ r->state = REBALANCE_THROTTLED;
+
+ bch2_kthread_io_clock_wait(clock,
+ r->throttled_until_iotime,
+ throttle);
+ continue;
+ }
}
/* minimum 1 mb/sec: */
GFP_NOIO)) ||
!(new_scratch = kmalloc(bytes, GFP_NOIO)) ||
(c->usage_gc &&
- !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO))))
+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) {
+ bch_err(c, "error updating replicas table: memory allocation failure");
goto err;
+ }
if (c->usage_base)
__replicas_table_update(new_base, new_r,
struct bch_replicas_entry *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
- int ret = -ENOMEM;
+ int ret = 0;
verify_replicas_entry(new_entry);
swap(new_gc, c->replicas_gc);
percpu_up_write(&c->mark_lock);
out:
- ret = 0;
-err:
mutex_unlock(&c->sb_lock);
kfree(new_r.entries);
kfree(new_gc.entries);
return ret;
+err:
+ bch_err(c, "error adding replicas entry: memory allocation failure");
+ ret = -ENOMEM;
+ goto out;
}
int bch2_mark_replicas(struct bch_fs *c,
GFP_NOIO);
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
+ bch_err(c, "error allocating c->replicas_gc");
return -ENOMEM;
}
nr = READ_ONCE(c->replicas.nr);
new.entry_size = READ_ONCE(c->replicas.entry_size);
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
- if (!new.entries)
+ if (!new.entries) {
+ bch_err(c, "error allocating c->replicas_gc");
return -ENOMEM;
+ }
mutex_lock(&c->sb_lock);
percpu_down_write(&c->mark_lock);
* https://131002.net/siphash/
*/
-#include <linux/stddef.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/bitops.h>
if (bch2_fs_init_fault("fs_start"))
goto err;
+ set_bit(BCH_FS_STARTED, &c->flags);
+
if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c);
} else {
goto err;
}
- set_bit(BCH_FS_STARTED, &c->flags);
print_mount_opts(c);
ret = 0;
out:
#endif
#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
-#define six_release(l) lock_release(l, 0, _RET_IP_)
+#define six_release(l) lock_release(l, _RET_IP_)
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */