-3592e42edfaed6a66470fb6a456a5895243ef2f4
+fd637ebda030609b15a473f01f1ef54bbe818f27
x(new_extent_overwrite, 9) \
x(incompressible, 10) \
x(btree_ptr_v2, 11) \
- x(extents_above_btree_updates, 12)
+ x(extents_above_btree_updates, 12) \
+ x(btree_updates_journalled, 13)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
static inline void bkey_reassemble(struct bkey_i *dst,
struct bkey_s_c src)
{
- BUG_ON(bkey_packed(src.k));
dst->k = *src.k;
- memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
}
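
For context on the helper swap above: memcpy_u64s_small() is the word-at-a-time copy used for short spans. A minimal sketch, assuming the usual util.h definition:

	static inline void memcpy_u64s_small(void *dst, const void *src,
					     unsigned u64s)
	{
		u64 *d = dst;
		const u64 *s = src;

		/* plain u64 loop: cheaper than the general copy for small counts */
		while (u64s--)
			*d++ = *s++;
	}
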
#define bkey_s_null ((struct bkey_s) { .k = NULL })
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
- if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+ if (bkey_cmp(k.k->p, b->data->min_key) < 0)
return "key before start of btree node";
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return nr;
}
-static void extent_sort_advance_prev(struct bkey_format *f,
- struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev)
-{
- if (*prev) {
- bch2_bkey_pack(*prev, (void *) *prev, f);
-
- btree_keys_account_key_add(nr, 0, *prev);
- *prev = bkey_next(*prev);
- } else {
- *prev = start;
- }
-}
-
static void extent_sort_append(struct bch_fs *c,
struct bkey_format *f,
struct btree_nr_keys *nr,
- struct bkey_packed *start,
- struct bkey_packed **prev,
+ struct bkey_packed **out,
struct bkey_s k)
{
- if (bkey_whiteout(k.k))
- return;
-
- /*
- * prev is always unpacked, for key merging - until right before we
- * advance it:
- */
- if (*prev &&
- bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
- BCH_MERGE_MERGE)
- return;
-
- extent_sort_advance_prev(f, nr, start, prev);
-
- bkey_reassemble((void *) *prev, k.s_c);
+ if (!bkey_whiteout(k.k)) {
+ if (!bch2_bkey_pack_key(*out, k.k, f))
+ memcpy_u64s_small(*out, k.k, BKEY_U64s);
+
+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
+
+ btree_keys_account_key_add(nr, 0, *out);
+ *out = bkey_next(*out);
+ }
}
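
A note on the pack-with-fallback idiom above: bch2_bkey_pack_key() returns false when the key can't be represented in the node-local format f, in which case the raw struct bkey (BKEY_U64s words) is stored instead. That's safe because readers already dispatch on the key's format byte; a rough reader-side sketch (names from bkey.h, hedged):

	struct bkey uk;

	if (bkey_packed(k))				/* format != KEY_FORMAT_CURRENT */
		uk = __bch2_bkey_unpack_key(f, k);	/* decode per-node format */
	else
		uk = *packed_to_bkey_c(k);		/* already a struct bkey */
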
/* Sort + repack in a new format: */
return nr;
}
-/* Sort, repack, and merge: */
+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
struct btree_nr_keys
bch2_sort_repack_merge(struct bch_fs *c,
struct bset *dst, struct btree *src,
struct bkey_format *out_f,
bool filter_whiteouts)
{
- struct bkey_packed *prev = NULL, *k_packed;
+ struct bkey_packed *out = vstruct_last(dst), *k_packed;
struct bkey_on_stack k;
struct btree_nr_keys nr;
bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
continue;
- extent_sort_append(c, out_f, &nr, vstruct_last(dst),
- &prev, bkey_i_to_s(k.k));
+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
}
- extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&k, c);
return nr;
}
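
Both sort paths now append through a moving output cursor and derive dst->u64s from the final cursor position. Advancing works because every packed key records its own length in u64s; roughly (exact definition hedged):

	static inline struct bkey_packed *bkey_next(struct bkey_packed *k)
	{
		/* step over the current key: its length is self-describing */
		return (struct bkey_packed *) ((u64 *) k + k->u64s);
	}
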
struct btree *b = iter->b;
struct bkey_format *f = &b->format;
struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
- struct bkey_packed *prev = NULL;
+ struct bkey_packed *out = dst->start;
struct bkey l_unpacked, r_unpacked;
struct bkey_s l, r;
struct btree_nr_keys nr;
l = __bkey_disassemble(b, _l->k, &l_unpacked);
if (iter->used == 1) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ extent_sort_append(c, f, &nr, &out, l);
extent_iter_advance(iter, 0);
continue;
}
/* If current key and next key don't overlap, just append */
if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
- extent_sort_append(c, f, &nr, dst->start, &prev, l);
+ extent_sort_append(c, f, &nr, &out, l);
extent_iter_advance(iter, 0);
continue;
}
__sort_iter_sift(iter, 0,
extent_sort_fix_overlapping_cmp);
- extent_sort_append(c, f, &nr, dst->start,
- &prev, bkey_i_to_s(split.k));
+ extent_sort_append(c, f, &nr, &out,
+ bkey_i_to_s(split.k));
} else {
bch2_cut_back_s(bkey_start_pos(r.k), l);
extent_save(b, _l->k, l.k);
}
}
- extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
- dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
bkey_on_stack_exit(&split, c);
return nr;
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_iter *iter,
const struct bkey_i *k,
+ enum btree_id btree_id,
unsigned level,
enum six_lock_type lock_type,
bool sync)
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
- if (!bch2_btree_node_relock(iter, level + 1))
+ if (iter && !bch2_btree_node_relock(iter, level + 1))
return ERR_PTR(-EINTR);
b = bch2_btree_node_mem_alloc(c);
return b;
bkey_copy(&b->key, k);
- if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
*
* XXX: ideally should be dropping all btree node locks here
*/
- if (btree_node_read_locked(iter, level + 1))
+ if (iter && btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
bch2_btree_node_read(c, b, sync);
* else we could read in a btree node from disk that's been
* freed:
*/
- b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ level, lock_type, true);
/* We raced and found the btree node in the cache */
if (!b)
return b;
}
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+ const struct bkey_i *k,
+ enum btree_id btree_id,
+ unsigned level)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ struct btree *b;
+ struct bset_tree *t;
+
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+ b = btree_node_mem_ptr(k);
+ if (b)
+ goto lock_node;
+retry:
+ b = btree_cache_find(bc, k);
+ if (unlikely(!b)) {
+ b = bch2_btree_node_fill(c, NULL, k, btree_id,
+ level, SIX_LOCK_read, true);
+
+ /* We raced and found the btree node in the cache */
+ if (!b)
+ goto retry;
+
+ if (IS_ERR(b))
+ return b;
+ } else {
+lock_node:
+ six_lock_read(&b->lock);
+
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+ b->btree_id != btree_id ||
+ b->level != level)) {
+ six_unlock_read(&b->lock);
+ goto retry;
+ }
+ }
+
+ /* XXX: waiting on IO with btree locks held: */
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+ TASK_UNINTERRUPTIBLE);
+
+ prefetch(b->aux_data);
+
+ for_each_bset(b, t) {
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+ prefetch(p + L1_CACHE_BYTES * 0);
+ prefetch(p + L1_CACHE_BYTES * 1);
+ prefetch(p + L1_CACHE_BYTES * 2);
+ }
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
+
+ if (unlikely(btree_node_read_error(b))) {
+ six_unlock_read(&b->lock);
+ return ERR_PTR(-EIO);
+ }
+
+ EBUG_ON(b->btree_id != btree_id ||
+ BTREE_NODE_LEVEL(b->data) != level ||
+ bkey_cmp(b->data->max_key, k->k.p));
+
+ return b;
+}
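
bch2_btree_node_get_noiter() gives code that holds no btree_iter (the GC init walk added below) a way to read a node straight from its pointer key. Typical use, mirroring the caller added later in this patch:

	struct btree *child = bch2_btree_node_get_noiter(c, k, btree_id,
							 level - 1);

	if (IS_ERR(child))
		return PTR_ERR(child);

	/* ... walk/mark the child ... */
	six_unlock_read(&child->lock);
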
+
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
if (b)
return;
- bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+ bch2_btree_node_fill(c, iter, k, iter->btree_id,
+ level, SIX_LOCK_read, false);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
const struct bkey_i *, unsigned,
enum six_lock_type);
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+ enum btree_id, unsigned);
+
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, enum btree_node_sibling);
return ret;
}
-static bool pos_in_journal_keys(struct journal_keys *journal_keys,
- enum btree_id id, struct bpos pos)
-{
- struct journal_key *k = journal_key_search(journal_keys, id, pos);
-
- return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
-}
-
static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
- struct journal_keys *journal_keys, bool initial)
+ bool initial)
{
struct btree_node_iter iter;
struct bkey unpacked;
for_each_btree_node_key_unpack(b, k, &iter,
&unpacked) {
- if (!b->level && journal_keys &&
- pos_in_journal_keys(journal_keys, b->btree_id, k.k->p))
- continue;
-
bch2_bkey_debugcheck(c, b, k);
ret = bch2_gc_mark_key(c, k, max_stale, initial);
}
static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
- struct journal_keys *journal_keys,
bool initial, bool metadata_only)
{
struct btree_trans trans;
gc_pos_set(c, gc_pos_btree_node(b));
- ret = btree_gc_mark_node(c, b, &max_stale,
- journal_keys, initial);
+ ret = btree_gc_mark_node(c, b, &max_stale, initial);
if (ret)
break;
return ret;
}
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+ struct journal_keys *journal_keys,
+ unsigned target_depth)
+{
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ u8 max_stale = 0;
+ int ret = 0;
+
+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_debugcheck(c, b, k);
+
+ ret = bch2_gc_mark_key(c, k, &max_stale, true);
+ if (ret)
+ break;
+
+ if (b->level > target_depth) {
+ struct btree *child;
+ BKEY_PADDED(k) tmp;
+
+ bkey_reassemble(&tmp.k, k);
+
+ child = bch2_btree_node_get_noiter(c, &tmp.k,
+ b->btree_id, b->level - 1);
+ ret = PTR_ERR_OR_ZERO(child);
+ if (ret)
+ break;
+
+ bch2_gc_btree_init_recurse(c, child,
+ journal_keys, target_depth);
+ six_unlock_read(&child->lock);
+ }
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
+ return ret;
+}
+
+static int bch2_gc_btree_init(struct bch_fs *c,
+ struct journal_keys *journal_keys,
+ enum btree_id btree_id,
+ bool metadata_only)
+{
+ struct btree *b;
+ unsigned target_depth = metadata_only ? 1
+ : expensive_debug_checks(c) ? 0
+ : !btree_node_type_needs_gc(btree_id) ? 1
+ : 0;
+ u8 max_stale = 0;
+ int ret = 0;
+
+ b = c->btree_roots[btree_id].b;
+
+ if (btree_node_fake(b))
+ return 0;
+
+ six_lock_read(&b->lock);
+ if (b->level >= target_depth)
+ ret = bch2_gc_btree_init_recurse(c, b,
+ journal_keys, target_depth);
+
+ if (!ret)
+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+ &max_stale, true);
+ six_unlock_read(&b->lock);
+
+ return ret;
+}
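
Spelling out the target_depth selection above:

	/*
	 * metadata_only           -> 1: mark interior nodes only
	 * expensive_debug_checks  -> 0: walk everything, leaves included
	 * type doesn't need GC    -> 1: leaf keys carry no allocation info
	 * otherwise               -> 0: full walk
	 */
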
+
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
return (int) btree_id_to_gc_phase(l) -
for (i = 0; i < BTREE_ID_NR; i++) {
enum btree_id id = ids[i];
- enum btree_node_type type = __btree_node_type(0, id);
-
- int ret = bch2_gc_btree(c, id, journal_keys,
- initial, metadata_only);
+ int ret = initial
+ ? bch2_gc_btree_init(c, journal_keys,
+ id, metadata_only)
+ : bch2_gc_btree(c, id, initial, metadata_only);
if (ret)
return ret;
-
- if (journal_keys && !metadata_only &&
- btree_node_type_needs_gc(type)) {
- struct journal_key *j;
- u8 max_stale;
- int ret;
-
- for_each_journal_key(*journal_keys, j)
- if (j->btree_id == id) {
- ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
- &max_stale, initial);
- if (ret)
- return ret;
- }
- }
}
return 0;
closure_put(&((struct btree_update *) new)->cl);
bch2_journal_pin_drop(&c->journal, &w->journal);
- closure_wake_up(&w->wait);
}
static void btree_node_write_done(struct bch_fs *c, struct btree *b)
wbio->wbio.bio.bi_end_io = btree_node_write_endio;
wbio->wbio.bio.bi_private = b;
- if (b->level || !b->written)
- wbio->wbio.bio.bi_opf |= REQ_FUA;
-
bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
/*
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos) {
unsigned long flags = READ_ONCE(b->flags);
- unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
if (!(flags & (1 << BTREE_NODE_dirty)))
continue;
- pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
b,
(flags & (1 << BTREE_NODE_dirty)) != 0,
(flags & (1 << BTREE_NODE_need_write)) != 0,
b->written,
!list_empty_careful(&b->write_blocked),
b->will_make_reachable != 0,
- b->will_make_reachable & 1,
- b->writes[ idx].wait.list.first != NULL,
- b->writes[!idx].wait.list.first != NULL);
+ b->will_make_reachable & 1);
}
rcu_read_unlock();
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+ enum six_lock_type lock_held)
{
while (b->written &&
btree_node_need_write(b) &&
btree_node_may_write(b)) {
if (!btree_node_write_in_flight(b)) {
- bch2_btree_node_write(c, b, SIX_LOCK_read);
+ bch2_btree_node_write(c, b, lock_held);
break;
}
six_unlock_read(&b->lock);
btree_node_wait_on_io(b);
- btree_node_lock_type(c, b, SIX_LOCK_read);
+ btree_node_lock_type(c, b, lock_held);
}
}
new |= (1 << BTREE_NODE_need_write); \
} while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \
\
- btree_node_write_if_need(_c, _b); \
+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);
goto retry_all;
}
- ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
+ if (hweight64(trans->iters_live) > 1)
+ ret = -EINTR;
+ else
+ trans_for_each_iter(trans, iter)
+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
+ ret = -EINTR;
+ break;
+ }
out:
bch2_btree_cache_cannibalize_unlock(c);
return ret;
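
The extra -EINTR case exists because iterators flagged BTREE_ITER_KEEP_UNTIL_COMMIT must not be repositioned behind the caller's back; returning -EINTR restarts the whole transaction instead. A sketch of the retry loop at the commit layer (shape only; the reset call is illustrative):

	do {
		bch2_trans_reset(&trans, 0);	/* illustrative reset step */
		ret = bch2_trans_commit(&trans, NULL, NULL, flags);
	} while (ret == -EINTR);
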
struct btree_write {
struct journal_entry_pin journal;
- struct closure_waitlist wait;
};
struct btree_alloc {
return iter->flags & BTREE_ITER_TYPE;
}
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+ return iter->l + iter->level;
+}
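
iter_l() names the btree_iter_level the iterator is currently positioned at, letting the call sites below stop hard-coding leaf level 0. The translation is mechanical:

	/* before, leaf-only: */
	struct btree *b0 = iter->l[0].b;
	/* after, correct at any level: */
	struct btree *b1 = iter_l(iter)->b;
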
+
struct btree_insert_entry {
unsigned trigger_flags;
unsigned trans_triggers_run:1;
struct btree_root {
struct btree *b;
- struct btree_update *as;
-
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
enum btree_insert_flags {
__BTREE_INSERT_NOUNLOCK,
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
}
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
- struct pending_btree_node_free *pending)
+ struct pending_btree_node_free *pending,
+ u64 journal_seq)
{
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+ 0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
- 0, 0, NULL, 0,
+ 0, 0, NULL, journal_seq,
BTREE_TRIGGER_OVERWRITE|
BTREE_TRIGGER_GC);
}
{
struct bch_fs *c = as->c;
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+ bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
- BUG_ON(as->nr_new_nodes);
- BUG_ON(as->nr_pending);
+ BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+ !bch2_journal_error(&c->journal));
if (as->reserve)
bch2_btree_reserve_put(c, as->reserve);
mutex_unlock(&c->btree_interior_update_lock);
}
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
struct bch_fs *c = as->c;
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
mutex_lock(&c->btree_interior_update_lock);
while (as->nr_new_nodes) {
}
while (as->nr_pending)
- bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+ bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+ seq);
mutex_unlock(&c->btree_interior_update_lock);
-
- closure_wake_up(&as->wait);
-
- bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
- struct bch_fs *c = as->c;
- int ret;
-
- ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
- if (ret == -EAGAIN) {
- continue_at(cl, btree_update_wait_on_journal, system_wq);
- return;
- }
- if (ret < 0)
- goto err;
-
- bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
- continue_at(cl, btree_update_nodes_reachable, system_wq);
}
static void btree_update_nodes_written(struct closure *cl)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
+ struct journal_res res = { 0 };
struct bch_fs *c = as->c;
struct btree *b;
+ struct bset *i;
+ struct bkey_i *k;
+ unsigned journal_u64s = 0;
+ int ret;
/*
* We did an update to a parent node where the pointers we added pointed
*/
mutex_lock(&c->btree_interior_update_lock);
as->nodes_written = true;
-retry:
+again:
as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
struct btree_update, unwritten_list);
if (!as || !as->nodes_written) {
return;
}
+ b = as->b;
+ if (b && !six_trylock_intent(&b->lock)) {
+ mutex_unlock(&c->btree_interior_update_lock);
+ btree_node_lock_type(c, b, SIX_LOCK_intent);
+ six_unlock_intent(&b->lock);
+ goto out;
+ }
+
+ journal_u64s = 0;
+
+ if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+ for_each_keylist_key(&as->parent_keys, k)
+ journal_u64s += jset_u64s(k->k.u64s);
+
+ ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
+ JOURNAL_RES_GET_RESERVED);
+ if (ret) {
+ BUG_ON(!bch2_journal_error(&c->journal));
+ /* can't unblock btree writes */
+ goto free_update;
+ }
+
+ if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+ for_each_keylist_key(&as->parent_keys, k)
+ bch2_journal_add_entry(&c->journal, &res,
+ BCH_JSET_ENTRY_btree_keys,
+ as->btree_id,
+ as->level,
+ k, k->k.u64s);
+
switch (as->mode) {
case BTREE_INTERIOR_NO_UPDATE:
BUG();
case BTREE_INTERIOR_UPDATING_NODE:
- /* The usual case: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- mutex_lock(&c->btree_interior_update_lock);
- goto retry;
- }
-
- BUG_ON(!btree_node_dirty(b));
- closure_wait(&btree_current_write(b)->wait, &as->cl);
+ /* @b is the node we did the final insert into: */
+ BUG_ON(!res.ref);
+ six_lock_write(&b->lock);
list_del(&as->write_blocked_list);
- /*
- * for flush_held_btree_writes() waiting on updates to flush or
- * nodes to be writeable:
- */
- closure_wake_up(&c->btree_interior_update_wait);
+ i = btree_bset_last(b);
+ i->journal_seq = cpu_to_le64(
+ max(res.seq,
+ le64_to_cpu(i->journal_seq)));
+
+ bch2_btree_add_journal_pin(c, b, res.seq);
+ six_unlock_write(&b->lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
* b->write_blocked prevented it from being written, so
* write it now if it needs to be written:
*/
- bch2_btree_node_write_cond(c, b, true);
- six_unlock_read(&b->lock);
- continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
+ six_unlock_intent(&b->lock);
break;
case BTREE_INTERIOR_UPDATING_AS:
- /*
- * The btree node we originally updated has been freed and is
- * being rewritten - so we need to write anything here, we just
- * need to signal to that btree_update that it's ok to make the
- * new replacement node visible:
- */
- closure_put(&as->parent_as->cl);
-
- /*
- * and then we have to wait on that btree_update to finish:
- */
- closure_wait(&as->parent_as->wait, &as->cl);
+ BUG_ON(b);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
-
- continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
break;
- case BTREE_INTERIOR_UPDATING_ROOT:
- /* b is the new btree root: */
- b = READ_ONCE(as->b);
-
- if (!six_trylock_read(&b->lock)) {
- mutex_unlock(&c->btree_interior_update_lock);
- btree_node_lock_type(c, b, SIX_LOCK_read);
- six_unlock_read(&b->lock);
- mutex_lock(&c->btree_interior_update_lock);
- goto retry;
- }
-
- BUG_ON(c->btree_roots[b->btree_id].as != as);
- c->btree_roots[b->btree_id].as = NULL;
-
- bch2_btree_set_root_ondisk(c, b, WRITE);
-
- /*
- * We don't have to wait anything anything here (before
- * btree_update_nodes_reachable frees the old nodes
- * ondisk) - we've ensured that the very next journal write will
- * have the pointer to the new root, and before the allocator
- * can reuse the old nodes it'll have to do a journal commit:
- */
- six_unlock_read(&b->lock);
+ case BTREE_INTERIOR_UPDATING_ROOT: {
+ struct btree_root *r = &c->btree_roots[as->btree_id];
+
+ BUG_ON(b);
+
+ mutex_lock(&c->btree_root_lock);
+ bkey_copy(&r->key, as->parent_keys.keys);
+ r->level = as->level;
+ r->alive = true;
+ c->btree_roots_dirty = true;
+ mutex_unlock(&c->btree_root_lock);
list_del(&as->unwritten_list);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * Bit of funny circularity going on here we have to break:
- *
- * We have to drop our journal pin before writing the journal
- * entry that points to the new btree root: else, we could
- * deadlock if the journal currently happens to be full.
- *
- * This mean we're dropping the journal pin _before_ the new
- * nodes are technically reachable - but this is safe, because
- * after the bch2_btree_set_root_ondisk() call above they will
- * be reachable as of the very next journal write:
- */
- bch2_journal_pin_drop(&c->journal, &as->journal);
-
- as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
- btree_update_wait_on_journal(&as->cl);
break;
}
+ }
+ bch2_journal_pin_drop(&c->journal, &as->journal);
+
+ bch2_journal_res_put(&c->journal, &res);
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+ btree_update_nodes_reachable(as, res.seq);
+free_update:
+ bch2_btree_update_free(as);
+ /*
+ * for flush_held_btree_writes() waiting on updates to flush or
+ * nodes to be writeable:
+ */
+ closure_wake_up(&c->btree_interior_update_wait);
+out:
mutex_lock(&c->btree_interior_update_lock);
- goto retry;
+ goto again;
}
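
In outline, the rewritten completion path above now does (error paths elided):

	/*
	 * 1. take a journal reservation (must succeed: JOURNAL_RES_GET_RESERVED)
	 * 2. journal every key in as->parent_keys as a btree_keys entry
	 * 3. BTREE_INTERIOR_UPDATING_NODE: stamp the bset's journal_seq, pin
	 *    the node at res.seq, write it out if it needs writing
	 *    BTREE_INTERIOR_UPDATING_ROOT: copy parent_keys into c->btree_roots[]
	 * 4. drop the update's journal pin, res and preres, free the btree_update
	 */
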
/*
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
BUG_ON(!btree_node_dirty(b));
- as->mode = BTREE_INTERIOR_UPDATING_NODE;
- as->b = b;
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
+ as->b = b;
+ as->level = b->level;
list_add(&as->write_blocked_list, &b->write_blocked);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * In general, when you're staging things in a journal that will later
- * be written elsewhere, and you also want to guarantee ordering: that
- * is, if you have updates a, b, c, after a crash you should never see c
- * and not a or b - there's a problem:
- *
- * If the final destination of the update(s) (i.e. btree node) can be
- * written/flushed _before_ the relevant journal entry - oops, that
- * breaks ordering, since the various leaf nodes can be written in any
- * order.
- *
- * Normally we use bset->journal_seq to deal with this - if during
- * recovery we find a btree node write that's newer than the newest
- * journal entry, we just ignore it - we don't need it, anything we're
- * supposed to have (that we reported as completed via fsync()) will
- * still be in the journal, and as far as the state of the journal is
- * concerned that btree node write never happened.
- *
- * That breaks when we're rewriting/splitting/merging nodes, since we're
- * mixing btree node writes that haven't happened yet with previously
- * written data that has been reported as completed to the journal.
- *
- * Thus, before making the new nodes reachable, we have to wait the
- * newest journal sequence number we have data for to be written (if it
- * hasn't been yet).
- */
- bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
- struct journal_entry_pin *pin, u64 seq)
-{
- struct btree_update *as =
- container_of(pin, struct btree_update, journal);
-
- bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
}
static void btree_update_reparent(struct btree_update *as,
{
struct bch_fs *c = as->c;
+ lockdep_assert_held(&c->btree_interior_update_lock);
+
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- child->parent_as = as;
- closure_get(&as->cl);
/*
* When we write a new btree root, we have to drop our journal pin
* just transfer the journal pin to the new interior update so
* btree_update_nodes_written() can drop it.
*/
- bch2_journal_pin_copy(&c->journal, &as->journal,
- &child->journal, interior_update_flush);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
bch2_journal_pin_drop(&c->journal, &child->journal);
-
- as->journal_seq = max(as->journal_seq, child->journal_seq);
}
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
- struct btree_root *r = &c->btree_roots[as->btree_id];
-
- mutex_lock(&c->btree_interior_update_lock);
- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
- /*
- * Old root might not be persistent yet - if so, redirect its
- * btree_update operation to point to us:
- */
- if (r->as)
- btree_update_reparent(as, r->as);
-
- as->mode = BTREE_INTERIOR_UPDATING_ROOT;
- as->b = r->b;
- r->as = as;
+ BUG_ON(!bch2_keylist_empty(&as->parent_keys));
+
+ mutex_lock(&c->btree_interior_update_lock);
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
+ as->level = b->level;
+ bch2_keylist_add(&as->parent_keys, &b->key);
mutex_unlock(&c->btree_interior_update_lock);
-
- /*
- * When we're rewriting nodes and updating interior nodes, there's an
- * issue with updates that haven't been written in the journal getting
- * mixed together with older data - see btree_update_updated_node()
- * for the explanation.
- *
- * However, this doesn't affect us when we're writing a new btree root -
- * because to make that new root reachable we have to write out a new
- * journal entry, which must necessarily be newer than as->journal_seq.
- */
}
static void btree_node_will_make_reachable(struct btree_update *as,
struct btree *b)
{
struct bch_fs *c = as->c;
- struct closure *cl, *cl_n;
struct btree_update *p, *n;
struct btree_write *w;
- struct bset_tree *t;
set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
- /*
- * Does this node have data that hasn't been written in the journal?
- *
- * If so, we have to wait for the corresponding journal entry to be
- * written before making the new nodes reachable - we can't just carry
- * over the bset->journal_seq tracking, since we'll be mixing those keys
- * in with keys that aren't in the journal anymore:
- */
- for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq,
- le64_to_cpu(bset(b, t)->journal_seq));
-
mutex_lock(&c->btree_interior_update_lock);
/*
clear_btree_node_dirty(b);
clear_btree_node_need_write(b);
- w = btree_current_write(b);
-
- /*
- * Does this node have any btree_update operations waiting on this node
- * to be written?
- *
- * If so, wake them up when this btree_update operation is reachable:
- */
- llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
- llist_add(&cl->list, &as->wait.list);
/*
* Does this node have unwritten data that has a pin on the journal?
* oldest pin of any of the nodes we're freeing. We'll release the pin
* when the new nodes are persistent and reachable on disk:
*/
- bch2_journal_pin_copy(&c->journal, &as->journal,
- &w->journal, interior_update_flush);
+ w = btree_current_write(b);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal,
- &w->journal, interior_update_flush);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
{
struct btree_reserve *reserve;
struct btree_update *as;
+ int ret;
reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
if (IS_ERR(reserve))
bch2_keylist_init(&as->parent_keys, as->inline_keys);
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+ jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
+ if (ret) {
+ bch2_btree_reserve_put(c, reserve);
+ closure_debug_destroy(&as->cl);
+ mempool_free(as, &c->btree_interior_update_pool);
+ return ERR_PTR(ret);
+ }
+
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
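
The journal preres taken above reserves worst-case space for the parent keys: up to three btree pointer keys of BKEY_BTREE_PTR_U64s_MAX u64s each (a split can emit that many), each costing jset_u64s(). For reference, a sketch of that helper, assuming the usual definition:

	static inline unsigned jset_u64s(unsigned u64s)
	{
		/* key payload plus the jset_entry header, in u64s */
		return u64s + sizeof(struct jset_entry) / sizeof(u64);
	}
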
mutex_unlock(&c->btree_interior_update_lock);
}
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
- struct btree_root *r = &c->btree_roots[b->btree_id];
-
- mutex_lock(&c->btree_root_lock);
-
- BUG_ON(b != r->b);
- bkey_copy(&r->key, &b->key);
- r->level = b->level;
- r->alive = true;
- if (rw == WRITE)
- c->btree_roots_dirty = true;
-
- mutex_unlock(&c->btree_root_lock);
-}
-
/**
* bch_btree_set_root - update the root in memory and on disk
*
bch2_btree_set_root_inmem(as, b);
- btree_update_updated_root(as);
+ btree_update_updated_root(as, b);
/*
* Unlock old root after new root is visible:
bch2_btree_build_aux_trees(n1);
six_unlock_write(&n1->lock);
- bch2_keylist_add(&as->parent_keys, &n1->key);
+ if (parent)
+ bch2_keylist_add(&as->parent_keys, &n1->key);
}
bch2_btree_node_write(c, n1, SIX_LOCK_intent);
(bkey_cmp_packed(b, k, &insert->k) >= 0))
;
- while (!bch2_keylist_empty(keys)) {
- insert = bch2_keylist_front(keys);
-
+ for_each_keylist_key(keys, insert)
bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
- bch2_keylist_pop_front(keys);
- }
btree_update_updated_node(as, b);
unsigned flags)
{
struct btree_trans *trans = iter->trans;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
struct btree_update *as;
struct closure cl;
int ret = 0;
bkey_copy(&b->key, new_key);
}
- btree_update_updated_root(as);
+ btree_update_updated_root(as, b);
bch2_btree_node_unlock_write(b, iter);
}
unsigned nodes_written:1;
enum btree_id btree_id;
+ u8 level;
struct btree_reserve *reserve;
+ struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:
struct btree *b;
struct list_head write_blocked_list;
- /*
- * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
- * we're now blocking another btree_update
- * @parent_as - btree_update that's waiting on our nodes to finish
- * writing, before it can make new nodes visible on disk
- * @wait - list of child btree_updates that are waiting on this
- * btree_update to make all the new nodes visible before they can free
- * their old btree nodes
- */
- struct btree_update *parent_as;
- struct closure_waitlist wait;
-
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
*/
struct journal_entry_pin journal;
- u64 journal_seq;
-
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
struct btree_insert_entry *i)
{
return i != trans->updates2 &&
- i[0].iter->l[0].b == i[-1].iter->l[0].b;
+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
}
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
return __btree_node_flush(j, pin, 1, seq);
}
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+ struct btree *b, u64 seq)
+{
+ struct btree_write *w = btree_current_write(b);
+
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
+ btree_node_write_idx(b) == 0
+ ? btree_node_flush0
+ : btree_node_flush1);
+}
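
Which flush callback the pin gets depends on which of the node's two write buffers the keys landed in; both are thin wrappers around __btree_node_flush(). The index-0 variant, matching the index-1 wrapper visible above:

	static int btree_node_flush0(struct journal *j,
				     struct journal_entry_pin *pin, u64 seq)
	{
		return __btree_node_flush(j, pin, 0, seq);
	}
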
+
static inline void __btree_journal_key(struct btree_trans *trans,
enum btree_id btree_id,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
- struct btree *b = iter->l[0].b;
- struct btree_write *w = btree_current_write(b);
- u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
- ? trans->journal_res.seq
- : j->replay_journal_seq;
+ struct btree *b = iter_l(iter)->b;

- EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
cpu_to_le64(trans->journal_res.seq);
}
- bch2_journal_pin_add(j, seq, &w->journal,
- btree_node_write_idx(b) == 0
- ? btree_node_flush0
- : btree_node_flush1);
+ bch2_btree_add_journal_pin(c, b,
+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ ? trans->journal_res.seq
+ : j->replay_journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
struct bset_tree *t = bset_tree_last(b);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
- EBUG_ON(iter->level);
-
insert->k.needs_whiteout = false;
- if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+ if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
bch2_btree_journal_key(trans, iter, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
{
struct bch_fs *c = trans->c;
- BUG_ON(iter->level);
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
BUG_ON(debug_check_bkeys(c) &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
unsigned *u64s)
{
struct bch_fs *c = trans->c;
- struct btree *b = iter->l[0].b;
+ struct btree *b = iter_l(iter)->b;
static enum btree_insert_ret ret;
if (unlikely(btree_node_fake(b)))
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
- if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+ if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
bch2_mark_update(trans, i->iter, i->k, NULL,
i->trigger_flags|BTREE_TRIGGER_GC);
}
int ret;
trans_for_each_update2(trans, i)
- BUG_ON(!btree_node_intent_locked(i->iter, 0));
+ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
ret = bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(trans->c,
- i->iter->l[0].b, i->iter);
+ iter_l(i->iter)->b, i->iter);
ret = bch2_trans_commit_write_locked(trans, stopped_at);
trans_for_each_update2(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
i->iter);
/*
* extent we're inserting and overwriting:
*/
*nr_iters += 1;
+ if (*nr_iters >= max_iters) {
+ *end = bpos_min(*end, k.k->p);
+ ret = 1;
+ }
switch (k.k->type) {
case KEY_TYPE_extent:
bch_verbose(c, "checking extents");
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
- POS(BCACHEFS_ROOT_INO, 0), 0);
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_INTENT);
retry:
for_each_btree_key_continue(iter, 0, k, ret) {
if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
/* iterate over keys read from the journal: */
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
- while (iter->k) {
- if (iter->k->btree_id == iter->btree_id)
- return bkey_i_to_s_c(iter->k->k);
- iter->k++;
- if (iter->k == iter->keys->d + iter->keys->nr)
- iter->k = NULL;
+ size_t l = 0, r = journal_keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
+ cmp_int(level, journal_keys->d[m].level) ?:
+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+ l = m + 1;
+ else
+ r = m;
}
- return bkey_s_c_null;
+ BUG_ON(l < journal_keys->nr &&
+ (cmp_int(id, journal_keys->d[l].btree_id) ?:
+ cmp_int(level, journal_keys->d[l].level) ?:
+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+
+ BUG_ON(l &&
+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
+ cmp_int(level, journal_keys->d[l - 1].level) ?:
+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+
+ return l < journal_keys->nr ? journal_keys->d + l : NULL;
}
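
journal_key_search() is a lower-bound binary search: it returns the first key >= (btree_id, level, pos) in the sort order, or NULL when the search runs off the end. Consumers walk forward from there while the id and level still match; sketch:

	struct journal_key *jk = journal_key_search(keys, id, level, pos);

	while (jk && jk < keys->d + keys->nr &&
	       jk->btree_id == id &&
	       jk->level == level) {
		/* ... consume jk->k ... */
		jk++;
	}
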
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- if (!iter->k)
- return bkey_s_c_null;
-
- iter->k++;
- if (iter->k == iter->keys->d + iter->keys->nr)
- iter->k = NULL;
-
- return bch2_journal_iter_peek(iter);
+ if (iter->k &&
+ iter->k < iter->keys->d + iter->keys->nr &&
+ iter->k->btree_id == iter->btree_id &&
+ iter->k->level == iter->level)
+ return iter->k->k;
+
+ iter->k = NULL;
+ return NULL;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+ if (iter->k)
+ iter->k++;
+}
+
+static void bch2_journal_iter_init(struct journal_iter *iter,
+ struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
+{
+ iter->btree_id = id;
+ iter->level = level;
+ iter->keys = journal_keys;
+ iter->k = journal_key_search(journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+ return iter->btree
+ ? bch2_btree_iter_peek(iter->btree)
+ : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+ iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+ if (iter->btree)
+ bch2_btree_iter_next(iter->btree);
+ else
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
}
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
case none:
break;
case btree:
- bch2_btree_iter_next(iter->btree);
+ bch2_journal_iter_advance_btree(iter);
break;
case journal:
- bch2_journal_iter_next(&iter->journal);
+ bch2_journal_iter_advance(&iter->journal);
break;
}
struct bkey_s_c ret;
while (1) {
- struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree);
- struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal);
+ struct bkey_s_c btree_k =
+ bch2_journal_iter_peek_btree(iter);
+ struct bkey_s_c journal_k =
+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
if (btree_k.k && journal_k.k) {
int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
if (!cmp)
- bch2_btree_iter_next(iter->btree);
+ bch2_journal_iter_advance_btree(iter);
iter->last = cmp < 0 ? btree : journal;
} else if (btree_k.k) {
}
ret = iter->last == journal ? journal_k : btree_k;
+
+ if (iter->b &&
+ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+ iter->journal.k = NULL;
+ iter->last = none;
+ return bkey_s_c_null;
+ }
+
if (!bkey_deleted(ret.k))
break;
return bch2_btree_and_journal_iter_peek(iter);
}
-struct journal_key *journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, struct bpos pos)
-{
- size_t l = 0, r = journal_keys->nr, m;
-
- while (l < r) {
- m = l + ((r - l) >> 1);
- if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
- bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
- l = m + 1;
- else
- r = m;
- }
-
- BUG_ON(l < journal_keys->nr &&
- (cmp_int(id, journal_keys->d[l].btree_id) ?:
- bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
-
- BUG_ON(l &&
- (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
- bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
-
- return l < journal_keys->nr ? journal_keys->d + l : NULL;
-}
-
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
struct btree_trans *trans,
struct journal_keys *journal_keys,
enum btree_id id, struct bpos pos)
{
- iter->journal.keys = journal_keys;
- iter->journal.k = journal_key_search(journal_keys, id, pos);
- iter->journal.btree_id = id;
+ memset(iter, 0, sizeof(*iter));
iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+}
+
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct journal_keys *journal_keys,
+ struct btree *b)
+{
+ struct bpos start = b->data->min_key;
+
+ if (btree_node_type_is_extents(b->btree_id))
+ start = bkey_successor(start);
+
+ memset(iter, 0, sizeof(*iter));
+
+ iter->b = b;
+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
+ bch2_journal_iter_init(&iter->journal, journal_keys,
+ b->btree_id, b->level, start);
}
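
The bkey_successor() call above exists because extent btree nodes treat min_key as exclusive, so iteration must start one position past it. Roughly (hedged sketch of the helper):

	static inline struct bpos bkey_successor(struct bpos p)
	{
		struct bpos ret = p;

		if (!++ret.offset)		/* offset wrapped around */
			BUG_ON(!++ret.inode);	/* carry into inode */

		return ret;
	}
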
/* sort and dedup all keys in the journal: */
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->level, r->level) ?:
bkey_cmp(l->k->k.p, r->k->k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->btree_id, r->btree_id) ?:
- bkey_cmp(l->k->k.p, r->k->k.p);
+ return cmp_int(r->level, l->level) ?:
+ cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->k->k.p, r->k->k.p);
}
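
Note the new comparator sorts by level descending before journal_seq (cmp_int(r->level, l->level)), so journalled interior node updates replay before any leaf keys. For example:

	/*
	 * replay order under journal_sort_seq_cmp:
	 *   { level 1, seq 10 }	interior updates first
	 *   { level 0, seq  5 }	then leaves, in seq order
	 *   { level 0, seq 10 }
	 */
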
static void journal_keys_free(struct journal_keys *keys)
for_each_jset_key(k, _n, entry, &p->j)
keys.d[keys.nr++] = (struct journal_key) {
.btree_id = entry->btree_id,
+ .level = entry->level,
.k = k,
.journal_seq = le64_to_cpu(p->j.seq) -
keys.journal_seq_base,
src = dst = keys.d;
while (src < keys.d + keys.nr) {
while (src + 1 < keys.d + keys.nr &&
- src[0].btree_id == src[1].btree_id &&
+ src[0].btree_id == src[1].btree_id &&
+ src[0].level == src[1].level &&
!bkey_cmp(src[0].k->k.p, src[1].k->k.p))
src++;
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
- enum btree_id id, struct bkey_i *k)
+ enum btree_id id, unsigned level,
+ struct bkey_i *k)
{
struct btree_iter *iter;
int ret;
- iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
+ iter = bch2_trans_get_node_iter(trans, id, k->k.p,
+ BTREE_MAX_DEPTH, level,
+ BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
}
static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
- struct bkey_i *k)
+ unsigned level, struct bkey_i *k)
{
return bch2_trans_do(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY,
- __bch2_journal_replay_key(&trans, id, k));
+ __bch2_journal_replay_key(&trans, id, level, k));
}
static int bch2_journal_replay(struct bch_fs *c,
sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+ replay_now_at(j, keys.journal_seq_base);
+
for_each_journal_key(keys, i) {
- replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+ if (!i->level)
+ replay_now_at(j, keys.journal_seq_base + i->journal_seq);

- if (i->btree_id == BTREE_ID_ALLOC)
+ if (i->level)
+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+ else if (i->btree_id == BTREE_ID_ALLOC)
ret = bch2_alloc_replay_key(c, i->k);
else if (i->k->k.size)
ret = bch2_extent_replay_key(c, i->btree_id, i->k);
else
- ret = bch2_journal_replay_key(c, i->btree_id, i->k);
+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
if (ret) {
bch_err(c, "journal replay: error %d while replaying key",
*/
bch_info(c, "starting metadata mark and sweep");
err = "error in mark and sweep";
- ret = bch2_gc(c, NULL, true, true);
+ ret = bch2_gc(c, &journal_keys, true, true);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
struct journal_keys {
struct journal_key {
enum btree_id btree_id:8;
+ unsigned level:8;
struct bkey_i *k;
u32 journal_seq;
u32 journal_offset;
for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
struct journal_iter {
+ enum btree_id btree_id;
+ unsigned level;
struct journal_keys *keys;
struct journal_key *k;
- enum btree_id btree_id;
};
-struct btree_and_journal_iter {
- enum btree_id btree_id;
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+struct btree_and_journal_iter {
struct btree_iter *btree;
+
+ struct btree *b;
+ struct btree_node_iter node_iter;
+ struct bkey unpacked;
+
struct journal_iter journal;
enum last_key_returned {
void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-struct journal_key *journal_key_search(struct journal_keys *,
- enum btree_id, struct bpos);
+
void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
struct btree_trans *,
struct journal_keys *,
enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct journal_keys *,
+ struct btree *);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;