X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_write_buffer.c;h=a6bf6ed37ced60cfee4bb61c15c47c06d5ace9c7;hb=7fd6c3ffe45b3b42c0bc8a8c5d1387a5e3316a54;hp=05b755a0e79ca21edf751778fc4c10e1b6472424;hpb=e160e9b97986d908bce40ab40ee5d930453a3bf1;p=bcachefs-tools-debian

diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 05b755a..a6bf6ed 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -11,6 +11,9 @@
 #include <linux/sort.h>
 
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+			struct journal_entry_pin *, u64);
+
 static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
 {
 	const struct btree_write_buffered_key *l = _l;
@@ -45,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
+	/*
+	 * We can't clone a path that has write locks: unshare it now, before
+	 * set_pos and traverse():
+	 */
+	if (iter->path->ref > 1)
+		iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
 	path = iter->path;
 
 	if (!*write_locked) {
@@ -66,11 +76,16 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 	(*fast)++;
 	return 0;
 trans_commit:
-	return  bch2_trans_update(trans, iter, &wb->k, 0) ?:
+	trans->journal_res.seq = wb->journal_seq;
+
+	return  bch2_trans_update(trans, iter, &wb->k,
+				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
 		bch2_trans_commit(trans, NULL, NULL,
 				  commit_flags|
-				  BTREE_INSERT_NOFAIL|
-				  BTREE_INSERT_JOURNAL_RECLAIM);
+				  BCH_TRANS_COMMIT_no_check_rw|
+				  BCH_TRANS_COMMIT_no_enospc|
+				  BCH_TRANS_COMMIT_no_journal_res|
+				  BCH_TRANS_COMMIT_journal_reclaim);
 }
 
 static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@@ -88,9 +103,40 @@ static union btree_write_buffer_state btree_write_buffer_switch(struct btree_wri
 	while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
 		cpu_relax();
 
+	smp_mb();
+
 	return old;
 }
 
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
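To make the ordering argument in the comment above concrete, here is a small self-contained model (an editor's illustration in plain C, not part of the patch; every name in it is invented). Recovery replays journal entries in seq order, so re-journalling a buffered key at a later seq would let it clobber an update that logically superseded it:

#include <stdio.h>

/* A toy journal entry: one keyed update stamped with a journal seq. */
struct toy_entry {
	unsigned long long	seq;
	int			val;
};

int main(void)
{
	/*
	 * A key is buffered at seq 5 (val 10) and updated again at seq 7
	 * (val 20). If the flush keeps the original seq, replay applies
	 * 5 then 7 and the newest value wins.
	 */
	struct toy_entry keep_seq[]  = { { 5, 10 }, { 7, 20 } };

	/*
	 * If the flush re-journalled the buffered key at seq 9 and we
	 * crashed, replay applies 7 then 9 and resurrects the stale value.
	 */
	struct toy_entry rejournal[] = { { 7, 20 }, { 9, 10 } };

	int a = 0, b = 0;

	for (int i = 0; i < 2; i++)
		a = keep_seq[i].val;	/* replay in seq order */
	for (int i = 0; i < 2; i++)
		b = rejournal[i].val;

	printf("kept original seq: %d (correct)\n", a);	/* 20 */
	printf("re-journalled:     %d (stale!)\n", b);	/* 10 */
	return 0;
}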
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+			  struct btree_write_buffered_key *wb)
+{
+	struct btree_iter iter;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+			     BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+	trans->journal_res.seq = wb->journal_seq;
+
+	ret   = bch2_btree_iter_traverse(&iter) ?:
+		bch2_trans_update(trans, &iter, &wb->k,
+				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
 int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
 				    bool locked)
 {
@@ -98,9 +144,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 	struct journal *j = &c->journal;
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
 	struct journal_entry_pin pin;
-	struct btree_write_buffered_key *i, *dst, *keys;
+	struct btree_write_buffered_key *i, *keys;
 	struct btree_iter iter = { NULL };
-	size_t nr = 0, skipped = 0, fast = 0;
+	size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
 	bool write_locked = false;
 	union btree_write_buffer_state s;
 	int ret = 0;
@@ -110,13 +156,17 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 	if (!locked && !mutex_trylock(&wb->flush_lock))
 		return 0;
 
-	bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+	bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+			      bch2_btree_write_buffer_journal_flush);
 	bch2_journal_pin_drop(j, &wb->journal_pin);
 
 	s = btree_write_buffer_switch(wb);
 	keys = wb->keys[s.idx];
 	nr = s.nr;
 
+	if (race_fault())
+		goto slowpath;
+
 	/*
 	 * We first sort so that we can detect and skip redundant updates, and
 	 * then we attempt to flush in sorted btree order, as this is most
@@ -124,15 +174,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 	 *
 	 * However, since we're not flushing in the order they appear in the
 	 * journal we won't be able to drop our journal pin until everything is
-	 * flushed - which means this could deadlock the journal, if we weren't
-	 * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail
+	 * flushed - which means this could deadlock the journal if we weren't
+	 * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
 	 * if it would block taking a journal reservation.
 	 *
-	 * If that happens, we sort them by the order they appeared in the
-	 * journal - after dropping redundant entries - and then restart
-	 * flushing, this time dropping journal pins as we go.
+	 * If that happens, simply skip the key so we can optimistically insert
+	 * as many keys as possible in the fast path.
 	 */
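Both sorts in this function hang off comparators whose bodies are truncated in the hunk context further up. A plausible reconstruction, for reference only (it assumes bcachefs' existing cmp_int() and bpos_cmp() helpers, and the real code may break further ties, e.g. on journal offset): btree/pos order makes redundant updates adjacent for the dedup pass, while journal-seq order lets the slowpath release the journal pin incrementally.

/* Fast path order: group by btree and position so redundant updates
 * to the same key end up adjacent and can be skipped. */
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
	const struct btree_write_buffered_key *l = _l;
	const struct btree_write_buffered_key *r = _r;

	return  cmp_int(l->btree, r->btree) ?:
		bpos_cmp(l->k.k.p, r->k.k.p) ?:
		cmp_int(l->journal_seq, r->journal_seq);
}

/* Slowpath order: by journal seq, so the journal pin can be bumped
 * forward as each key is flushed. */
static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
{
	const struct btree_write_buffered_key *l = _l;
	const struct btree_write_buffered_key *r = _r;

	return cmp_int(l->journal_seq, r->journal_seq);
}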
-
 	sort(keys, nr, sizeof(keys[0]), btree_write_buffered_key_cmp, NULL);
@@ -141,6 +189,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 		    i[0].btree == i[1].btree &&
 		    bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
 			skipped++;
+			i->journal_seq = 0;
 			continue;
 		}
 
@@ -153,7 +202,8 @@
 
 		if (!iter.path || iter.path->btree_id != i->btree) {
 			bch2_trans_iter_exit(trans, &iter);
-			bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+			bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+					     BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
 		}
 
 		bch2_btree_iter_set_pos(&iter, i->k.k.p);
@@ -166,8 +216,14 @@
 			bch2_trans_begin(trans);
 		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
+		if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+			slowpath++;
+			continue;
+		}
 		if (ret)
 			break;
+
+		i->journal_seq = 0;
 	}
 
 	if (write_locked)
@@ -176,7 +232,7 @@
 
 	trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
 
-	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+	if (slowpath)
 		goto slowpath;
 
 	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
@@ -187,40 +243,31 @@ out:
 slowpath:
 	trace_write_buffer_flush_slowpath(trans, i - keys, nr);
 
-	dst = keys;
-	for (; i < keys + nr; i++) {
-		if (i + 1 < keys + nr &&
-		    i[0].btree == i[1].btree &&
-		    bpos_eq(i[0].k.k.p, i[1].k.k.p))
-			continue;
-
-		*dst = *i;
-		dst++;
-	}
-	nr = dst - keys;
-
+	/*
+	 * Now sort the rest by journal seq and bump the journal pin as we go.
+	 * The fast path zapped the seq of keys that were successfully flushed so
+	 * we can skip those here.
+	 */
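bch2_journal_pin_update(), used below, subsumes the open-coded temporary-pin dance visible in the removed lines further down. As a sketch of what it accomplishes, expressed with the same primitives the removed code used (editor's illustration only; the real helper does this in one step under the journal lock, and journal_pin_flush_fn is bcachefs' existing callback typedef):

/* Editor's sketch, not the real implementation: advance an existing pin
 * to a new seq without ever leaving the journal unpinned in between. */
static void journal_pin_update_sketch(struct journal *j, u64 seq,
				      struct journal_entry_pin *pin,
				      journal_pin_flush_fn flush_fn)
{
	struct journal_entry_pin tmp;

	memset(&tmp, 0, sizeof(tmp));

	bch2_journal_pin_add(j, seq, &tmp, flush_fn);	/* pin the new seq first... */
	bch2_journal_pin_drop(j, pin);			/* ...then the old seq may be released */
	bch2_journal_pin_copy(j, pin, &tmp, flush_fn);	/* hand it back to the caller's pin */
	bch2_journal_pin_drop(j, &tmp);
}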
 	sort(keys, nr, sizeof(keys[0]), btree_write_buffered_journal_cmp, NULL);
 
-	for (i = keys; i < keys + nr; i++) {
-		if (i->journal_seq > pin.seq) {
-			struct journal_entry_pin pin2;
+	commit_flags &= ~BCH_WATERMARK_MASK;
+	commit_flags |= BCH_WATERMARK_reclaim;
 
-			memset(&pin2, 0, sizeof(pin2));
+	for (i = keys; i < keys + nr; i++) {
+		if (!i->journal_seq)
+			continue;
 
-			bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
-			bch2_journal_pin_drop(j, &pin);
-			bch2_journal_pin_copy(j, &pin, &pin2, NULL);
-			bch2_journal_pin_drop(j, &pin2);
-		}
+		bch2_journal_pin_update(j, i->journal_seq, &pin,
+					bch2_btree_write_buffer_journal_flush);
 
 		ret = commit_do(trans, NULL, NULL,
 				commit_flags|
-				BTREE_INSERT_NOFAIL|
-				BTREE_INSERT_JOURNAL_RECLAIM|
-				JOURNAL_WATERMARK_reserved,
-				__bch2_btree_insert(trans, i->btree, &i->k, 0));
+				BCH_TRANS_COMMIT_no_enospc|
+				BCH_TRANS_COMMIT_no_journal_res|
+				BCH_TRANS_COMMIT_journal_reclaim,
+				btree_write_buffered_insert(trans, i));
 		if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
 			break;
 	}
@@ -249,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
 	mutex_lock(&wb->flush_lock);
 
 	return bch2_trans_run(c,
-		__bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+		__bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
@@ -322,7 +369,7 @@ int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
 	wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
 	wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
 	if (!wb->keys[0] || !wb->keys[1])
-		return -ENOMEM;
+		return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
 
 	return 0;
 }
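Finally, a note on the smp_mb() added to btree_write_buffer_switch() above: after flipping the active buffer and spinning until writers holding a ref on the old buffer drain out, the flusher needs a full barrier so its reads of the old buffer's keys cannot be ordered before the ref check. A rough userspace analogue with C11 atomics (editor's illustration; the toy_* names are invented):

#include <stdatomic.h>

struct toy_wb {
	_Atomic unsigned	idx;	/* buffer new writers append to: 0 or 1 */
	_Atomic unsigned	ref[2];	/* writers currently inside each buffer */
};

/* Writer side: take a ref on the active buffer around the key append. */
static unsigned toy_writer_enter(struct toy_wb *wb)
{
	unsigned i = atomic_load(&wb->idx);

	atomic_fetch_add(&wb->ref[i], 1);
	return i;
}

static void toy_writer_exit(struct toy_wb *wb, unsigned i)
{
	/* Release order: the key append must be visible before the ref drop. */
	atomic_fetch_sub_explicit(&wb->ref[i], 1, memory_order_release);
}

/* Flusher side: flip buffers, wait out writers on the old buffer, then
 * fence before reading its keys - the role smp_mb() plays in the patch. */
static unsigned toy_switch(struct toy_wb *wb)
{
	unsigned old = atomic_fetch_xor(&wb->idx, 1);	/* returns previous idx */

	while (atomic_load_explicit(&wb->ref[old], memory_order_relaxed))
		;	/* cpu_relax() */

	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	return old;	/* buffer 'old' may now be drained safely */
}

Unlike this toy, the real code packs nr, idx and both refcounts into a single u64 (union btree_write_buffer_state) updated with cmpxchg, which closes the window a writer would otherwise have between reading idx and taking the ref.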