}
return 0;
trans_commit:
- return bch2_trans_update(trans, iter, &wb->k, 0) ?:
+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
+ BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RECLAIM);
}
return old;
}
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ *
+ * @trans: transaction the update is queued on
+ * @wb: buffered key to insert; supplies the target btree (wb->btree), the
+ *      key (wb->k) and the journal seq passed along with it
+ *      (wb->journal_seq)
+ *
+ * This helper only queues the update on @trans and does not commit; the
+ * slowpath in __bch2_btree_write_buffer_flush() invokes it via commit_do().
+ *
+ * Returns the iterator traverse result if that is nonzero, otherwise the
+ * result of bch2_trans_update_seq().
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_iter iter;
+ int ret;
+
+ bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+ ret = bch2_btree_iter_traverse(&iter) ?: /* GNU "a ?: b": update is skipped if traverse returns nonzero */
+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_iter_exit(trans, &iter); /* iterator is released whether or not the above succeeded */
+ return ret;
+}
+
int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
bool locked)
{
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin;
- struct btree_write_buffered_key *i, *dst, *keys;
+ struct btree_write_buffered_key *i, *keys;
struct btree_iter iter = { NULL };
- size_t nr = 0, skipped = 0, fast = 0;
+ size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
union btree_write_buffer_state s;
int ret = 0;
keys = wb->keys[s.idx];
nr = s.nr;
+ if (race_fault())
+ goto slowpath;
+
/*
* We first sort so that we can detect and skip redundant updates, and
* then we attempt to flush in sorted btree order, as this is most
*
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
- * flushed - which means this could deadlock the journal, if we weren't
- * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail
+ * flushed - which means this could deadlock the journal if we weren't
+ * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
* if it would block taking a journal reservation.
*
- * If that happens, we sort them by the order they appeared in the
- * journal - after dropping redundant entries - and then restart
- * flushing, this time dropping journal pins as we go.
+ * If that happens, simply skip the key so we can optimistically insert
+ * as many keys as possible in the fast path.
*/
-
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_key_cmp, NULL);
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
+ i->journal_seq = 0;
continue;
}
if (!iter.path || iter.path->btree_id != i->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
bch2_btree_iter_set_pos(&iter, i->k.k.p);
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ slowpath++;
+ continue;
+ }
if (ret)
break;
+
+ i->journal_seq = 0;
}
if (write_locked)
trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+ if (slowpath)
goto slowpath;
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
slowpath:
trace_write_buffer_flush_slowpath(trans, i - keys, nr);
- dst = keys;
- for (; i < keys + nr; i++) {
- if (i + 1 < keys + nr &&
- i[0].btree == i[1].btree &&
- bpos_eq(i[0].k.k.p, i[1].k.k.p))
- continue;
-
- *dst = *i;
- dst++;
- }
- nr = dst - keys;
-
+ /*
+ * Now sort the rest by journal seq and bump the journal pin as we go.
+ * The fast path zapped the seq of keys that were successfully flushed or
+ * skipped as redundant, so we can skip those here.
+ */
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_journal_cmp,
NULL);
+ commit_flags &= ~BCH_WATERMARK_MASK;
+ commit_flags |= BCH_WATERMARK_reclaim;
+
for (i = keys; i < keys + nr; i++) {
+ if (!i->journal_seq)
+ continue;
+
if (i->journal_seq > pin.seq) {
struct journal_entry_pin pin2;
ret = commit_do(trans, NULL, NULL,
commit_flags|
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM|
- JOURNAL_WATERMARK_reserved,
- __bch2_btree_insert(trans, i->btree, &i->k, 0));
+ BTREE_INSERT_JOURNAL_RECLAIM,
+ btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break;
}
mutex_lock(&wb->flush_lock);
return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+ __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
}
static inline u64 btree_write_buffer_ref(int idx)
wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
if (!wb->keys[0] || !wb->keys[1])
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
return 0;
}