Update bcachefs sources to 6628827a8707 bcachefs: Skip deleted members in member_to_t...
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c
index 026c249a3f441c9073aaa2641a9fec9290b50baf..4e6241db518b59d62c551e3d6d9c2541fd87737a 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -75,9 +75,11 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
        }
        return 0;
 trans_commit:
-       return  bch2_trans_update(trans, iter, &wb->k, 0) ?:
+       return  bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  commit_flags|
+                                 BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_JOURNAL_RECLAIM);
 }
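
The `?:` chain in this fast-path commit is the GNU "Elvis" extension used
throughout bcachefs: `a ?: b` evaluates to `a` when `a` is nonzero and to `b`
otherwise, so each step runs only if the previous one returned 0. A minimal
userspace sketch of the idiom (hypothetical step functions, not bcachefs API;
needs gcc or clang):

	#include <stdio.h>

	static int step1(void) { return 0; }	/* succeeds */
	static int step2(void) { return -5; }	/* fails */
	static int step3(void) { printf("never reached\n"); return 0; }

	int main(void)
	{
		/* Same as: ret = step1(); if (!ret) ret = step2(); if (!ret) ret = step3(); */
		int ret = step1() ?: step2() ?: step3();

		printf("ret = %d\n", ret);	/* -5: step3 was never evaluated */
		return 0;
	}
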
@@ -102,6 +104,33 @@ static union btree_write_buffer_state btree_write_buffer_switch(struct btree_wri
        return old;
 }
 
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer at a seq that
+ * precedes the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+                         struct btree_write_buffered_key *wb)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+                            BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
 int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
                                    bool locked)
 {
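
The comment above the new helper is the crux of this change: every write
buffered key already carries the journal seq it was originally journalled at,
and recovery replays the journal strictly in seq order, so re-journalling a
buffered key at flush time would give a stale value a newer seq. A standalone
sketch of the failure mode (toy replay model, not bcachefs structures):

	#include <stdio.h>
	#include <stdlib.h>

	/* One key, updates applied in journal seq order, last write wins. */
	struct entry { unsigned long seq; const char *val; };

	static int cmp_seq(const void *l, const void *r)
	{
		const struct entry *a = l, *b = r;
		return (a->seq > b->seq) - (a->seq < b->seq);
	}

	static const char *replay(struct entry *e, size_t n)
	{
		qsort(e, n, sizeof(*e), cmp_seq);
		return e[n - 1].val;	/* highest seq wins */
	}

	int main(void)
	{
		/* Key written at seq 10, then modified again at seq 20. */
		struct entry good[] = { { 10, "old" }, { 20, "new" } };
		/* Same, but the seq-10 copy was re-journalled at seq 30 on flush. */
		struct entry bad[]  = { { 10, "old" }, { 20, "new" }, { 30, "old" } };

		printf("replay, original seq kept: %s\n", replay(good, 2));	/* new */
		printf("replay after re-journal:   %s\n", replay(bad, 3));	/* old: wrong */
		return 0;
	}

This is why both the fast path and btree_write_buffered_insert() pass
wb->journal_seq to bch2_trans_update_seq() rather than letting the commit
journal the key afresh.
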
@@ -109,9 +138,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
        struct journal *j = &c->journal;
        struct btree_write_buffer *wb = &c->btree_write_buffer;
        struct journal_entry_pin pin;
-       struct btree_write_buffered_key *i, *dst, *keys;
+       struct btree_write_buffered_key *i, *keys;
        struct btree_iter iter = { NULL };
-       size_t nr = 0, skipped = 0, fast = 0;
+       size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
        bool write_locked = false;
        union btree_write_buffer_state s;
        int ret = 0;
@@ -128,6 +157,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
        keys = wb->keys[s.idx];
        nr = s.nr;
 
+       if (race_fault())
+               goto slowpath;
+
        /*
         * We first sort so that we can detect and skip redundant updates, and
         * then we attempt to flush in sorted btree order, as this is most
@@ -135,15 +167,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         *
         * However, since we're not flushing in the order they appear in the
         * journal we won't be able to drop our journal pin until everything is
-        * flushed - which means this could deadlock the journal, if we weren't
-        * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail
+        * flushed - which means this could deadlock the journal if we weren't
+        * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
         * if it would block taking a journal reservation.
         *
-        * If that happens, we sort them by the order they appeared in the
-        * journal - after dropping redundant entries - and then restart
-        * flushing, this time dropping journal pins as we go.
+        * If that happens, simply skip the key so we can optimistically insert
+        * as many keys as possible in the fast path.
         */
-
        sort(keys, nr, sizeof(keys[0]),
             btree_write_buffered_key_cmp, NULL);
 
@@ -152,6 +182,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
                    i[0].btree == i[1].btree &&
                    bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
                        skipped++;
+                       i->journal_seq = 0;
                        continue;
                }
 
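
Zeroing i->journal_seq for skipped duplicates matters later: the slowpath
scans by journal seq and treats seq == 0 as "nothing left to do". The dedup
itself relies on the sort placing same-position updates adjacent, newest
last. A self-contained sketch of that pass (hypothetical flattened key,
mirroring the loop above):

	#include <stdio.h>
	#include <stdlib.h>

	struct wb_key { unsigned btree; unsigned long long pos, seq; };

	/* Order by (btree, pos), ties broken by journal seq: newest lands last. */
	static int wb_key_cmp(const void *l, const void *r)
	{
		const struct wb_key *a = l, *b = r;

		if (a->btree != b->btree)	return a->btree < b->btree ? -1 : 1;
		if (a->pos != b->pos)		return a->pos < b->pos ? -1 : 1;
		if (a->seq != b->seq)		return a->seq < b->seq ? -1 : 1;
		return 0;
	}

	int main(void)
	{
		struct wb_key keys[] = {
			{ 1, 100, 7 }, { 0, 50, 3 }, { 1, 100, 9 }, { 0, 50, 5 },
		};
		size_t nr = sizeof(keys) / sizeof(keys[0]), skipped = 0;

		qsort(keys, nr, sizeof(keys[0]), wb_key_cmp);

		for (struct wb_key *i = keys; i < keys + nr; i++) {
			/* A newer update to the same position follows: skip this one. */
			if (i + 1 < keys + nr &&
			    i[0].btree == i[1].btree && i[0].pos == i[1].pos) {
				skipped++;
				i->seq = 0;
				continue;
			}
			printf("flush btree=%u pos=%llu seq=%llu\n",
			       i->btree, i->pos, i->seq);
		}
		printf("skipped %zu redundant updates\n", skipped);
		return 0;
	}
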
@@ -164,7 +195,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 
                if (!iter.path || iter.path->btree_id != i->btree) {
                        bch2_trans_iter_exit(trans, &iter);
-                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+                                            BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
                }
 
                bch2_btree_iter_set_pos(&iter, i->k.k.p);
@@ -177,8 +209,14 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
                                bch2_trans_begin(trans);
                } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
+               if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+                       slowpath++;
+                       continue;
+               }
                if (ret)
                        break;
+
+               i->journal_seq = 0;
        }
 
        if (write_locked)
@@ -187,7 +225,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 
        trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
 
-       if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+       if (slowpath)
                goto slowpath;
 
        bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
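
Note the behavioural change here: -BCH_ERR_journal_reclaim_would_deadlock
used to abort the entire fast path, forcing every remaining key through the
slow ordered flush. Now each key that cannot take a journal reservation is
simply counted and skipped, and the slowpath only runs over the leftovers.
The shape of that pattern, as a sketch (try_fast() is a made-up stand-in for
the journal reservation failure):

	#include <stdio.h>

	#define NR 5

	/* Pretend every other item would block on a journal reservation. */
	static int try_fast(int idx) { return idx & 1 ? -1 : 0; }

	int main(void)
	{
		unsigned long seq[NR] = { 12, 15, 11, 19, 14 };
		int slowpath = 0;

		/* Optimistic pass: flush what we can, zap the seq of what we did. */
		for (int i = 0; i < NR; i++) {
			if (try_fast(i)) {
				slowpath++;	/* keep seq, retry below */
				continue;
			}
			seq[i] = 0;
		}

		/* Second pass only if something was left behind. */
		if (slowpath)
			for (int i = 0; i < NR; i++)
				if (seq[i])
					printf("slowpath: item %d, seq %lu\n",
					       i, seq[i]);
		return 0;
	}
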
@@ -198,23 +236,22 @@ out:
 slowpath:
        trace_write_buffer_flush_slowpath(trans, i - keys, nr);
 
-       dst = keys;
-       for (; i < keys + nr; i++) {
-               if (i + 1 < keys + nr &&
-                   i[0].btree == i[1].btree &&
-                   bpos_eq(i[0].k.k.p, i[1].k.k.p))
-                       continue;
-
-               *dst = *i;
-               dst++;
-       }
-       nr = dst - keys;
-
+       /*
+        * Now sort the rest by journal seq and bump the journal pin as we go.
+        * The fast path zapped the seq of keys that were successfully flushed so
+        * we can skip those here.
+        */
        sort(keys, nr, sizeof(keys[0]),
             btree_write_buffered_journal_cmp,
             NULL);
 
+       commit_flags &= ~BCH_WATERMARK_MASK;
+       commit_flags |= BCH_WATERMARK_reclaim;
+
        for (i = keys; i < keys + nr; i++) {
+               if (!i->journal_seq)
+                       continue;
+
                if (i->journal_seq > pin.seq) {
                        struct journal_entry_pin pin2;
 
@@ -229,9 +266,8 @@ slowpath:
                ret = commit_do(trans, NULL, NULL,
                                commit_flags|
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_JOURNAL_RECLAIM|
-                               JOURNAL_WATERMARK_reserved,
-                               __bch2_btree_insert(trans, i->btree, &i->k, 0));
+                               BTREE_INSERT_JOURNAL_RECLAIM,
+                               btree_write_buffered_insert(trans, i));
                if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
                        break;
        }
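
Because the leftovers are sorted by journal seq, the slowpath can release
journal space as it goes: before flushing a key with a higher seq than the
current pin, it takes a new pin at that seq and only then drops the old one
(the pin2 dance above), so the flush never becomes momentarily unpinned. A
sketch of that hand-over (toy pin, loosely modelled on journal_entry_pin):

	#include <stdio.h>

	/* A pin holds back reclaim of all journal entries >= pin.seq. */
	struct pin { unsigned long seq; };

	static void pin_move(struct pin *p, unsigned long seq)
	{
		struct pin new = { .seq = seq };	/* add the new pin first... */
		*p = new;				/* ...then drop the old one */
		printf("pin advanced to seq %lu, earlier entries reclaimable\n", seq);
	}

	int main(void)
	{
		unsigned long seqs[] = { 10, 10, 13, 17 };	/* sorted by seq */
		struct pin pin = { .seq = 10 };

		for (int i = 0; i < 4; i++) {
			if (seqs[i] > pin.seq)
				pin_move(&pin, seqs[i]);
			printf("flush key at seq %lu\n", seqs[i]);
		}
		return 0;
	}

The watermark change above serves the same goal: flushing the write buffer is
itself journal reclaim, so the slowpath commits with BCH_WATERMARK_reclaim
rather than the removed JOURNAL_WATERMARK_reserved flag.
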
@@ -260,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
        mutex_lock(&wb->flush_lock);
 
        return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+                       __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
@@ -333,7 +369,7 @@ int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
        wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
        wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
        if (!wb->keys[0] || !wb->keys[1])
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
 
        return 0;
 }
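
Returning -BCH_ERR_ENOMEM_fs_btree_write_buffer_init instead of a bare
-ENOMEM follows bcachefs's private errcode scheme: dedicated codes above the
standard errno range that record where an error originated and unwrap to the
generic class at the boundary. A toy version of the idea (illustrative
constants and helpers, not the real bch2_err_class()/bch2_err_str()
machinery):

	#include <errno.h>
	#include <stdio.h>

	/* Private codes allocated above the errno range, each with a class. */
	enum {
		ERR_START = 2048,
		ERR_ENOMEM_write_buffer_init,
		ERR_ENOMEM_journal_init,
	};

	/* Map a private code back to the plain errno callers expect. */
	static int err_class(int err)
	{
		switch (-err) {
		case ERR_ENOMEM_write_buffer_init:
		case ERR_ENOMEM_journal_init:
			return -ENOMEM;
		default:
			return err;
		}
	}

	static const char *err_str(int err)
	{
		switch (-err) {
		case ERR_ENOMEM_write_buffer_init: return "ENOMEM_write_buffer_init";
		case ERR_ENOMEM_journal_init:      return "ENOMEM_journal_init";
		default:                           return "generic";
		}
	}

	int main(void)
	{
		int ret = -ERR_ENOMEM_write_buffer_init;

		/* Logs keep the precise origin; callers still see -ENOMEM. */
		printf("%s -> errno %d\n", err_str(ret), err_class(ret));
		return 0;
	}
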