Update bcachefs sources to 6628827a8707 bcachefs: Skip deleted members in member_to_t...

[bcachefs-tools-debian] / libbcachefs / btree_write_buffer.c
diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c

index 026c249a3f441c9073aaa2641a9fec9290b50baf..4e6241db518b59d62c551e3d6d9c2541fd87737a 100644 (file)
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
@@ -75,9 +75,11 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
         }
         return 0;
  trans_commit:
-       return  bch2_trans_update(trans, iter, &wb->k, 0) ?:
+       return  bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
+                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                 bch2_trans_commit(trans, NULL, NULL,
                                   commit_flags|
+                                 BTREE_INSERT_NOCHECK_RW|
                                   BTREE_INSERT_NOFAIL|
                                   BTREE_INSERT_JOURNAL_RECLAIM);
  }
@@ -102,6 +104,33 @@ static union btree_write_buffer_state btree_write_buffer_switch(struct btree_wri
         return old;
  }
  
+/*
+ * Update a btree with a write buffered key using the journal seq of the
+ * original write buffer insert.
+ *
+ * It is not safe to rejournal the key once it has been inserted into the write
+ * buffer because that may break recovery ordering. For example, the key may
+ * have already been modified in the active write buffer in a seq that comes
+ * before the current transaction. If we were to journal this key again and
+ * crash, recovery would process updates in the wrong order.
+ */
+static int
+btree_write_buffered_insert(struct btree_trans *trans,
+                         struct btree_write_buffered_key *wb)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
+                            BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+       /* GNU ?: chain: a nonzero traverse result is returned and the update is skipped */
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
+                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       bch2_trans_iter_exit(trans, &iter);     /* runs on both success and error */
+       return ret;
+}
+
  int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
                                     bool locked)
  {
@@ -109,9 +138,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         struct journal *j = &c->journal;
         struct btree_write_buffer *wb = &c->btree_write_buffer;
         struct journal_entry_pin pin;
-       struct btree_write_buffered_key *i, *dst, *keys;
+       struct btree_write_buffered_key *i, *keys;
         struct btree_iter iter = { NULL };
-       size_t nr = 0, skipped = 0, fast = 0;
+       size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
         bool write_locked = false;
         union btree_write_buffer_state s;
         int ret = 0;
@@ -128,6 +157,9 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         keys = wb->keys[s.idx];
         nr = s.nr;
  
+       if (race_fault())
+               goto slowpath;
+
         /*
          * We first sort so that we can detect and skip redundant updates, and
          * then we attempt to flush in sorted btree order, as this is most
@@ -135,15 +167,13 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
          *
          * However, since we're not flushing in the order they appear in the
          * journal we won't be able to drop our journal pin until everything is
-        * flushed - which means this could deadlock the journal, if we weren't
-        * passing BTREE_INSERT_JORUNAL_RECLAIM. This causes the update to fail
+        * flushed - which means this could deadlock the journal if we weren't
+        * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
          * if it would block taking a journal reservation.
          *
-        * If that happens, we sort them by the order they appeared in the
-        * journal - after dropping redundant entries - and then restart
-        * flushing, this time dropping journal pins as we go.
+        * If that happens, simply skip the key so we can optimistically insert
+        * as many keys as possible in the fast path.
          */
-
         sort(keys, nr, sizeof(keys[0]),
              btree_write_buffered_key_cmp, NULL);
  
@@ -152,6 +182,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
                     i[0].btree == i[1].btree &&
                     bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
                         skipped++;
+                       i->journal_seq = 0;
                         continue;
                 }
  
@@ -164,7 +195,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
  
                 if (!iter.path || iter.path->btree_id != i->btree) {
                         bch2_trans_iter_exit(trans, &iter);
-                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+                                            BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
                 }
  
                 bch2_btree_iter_set_pos(&iter, i->k.k.p);
@@ -177,8 +209,14 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
                                 bch2_trans_begin(trans);
                 } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
  
+               if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+                       slowpath++;
+                       continue;
+               }
                 if (ret)
                         break;
+
+               i->journal_seq = 0;
         }
  
         if (write_locked)
@@ -187,7 +225,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
  
         trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
  
-       if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+       if (slowpath)
                 goto slowpath;
  
         bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
@@ -198,23 +236,22 @@ out:
  slowpath:
         trace_write_buffer_flush_slowpath(trans, i - keys, nr);
  
-       dst = keys;
-       for (; i < keys + nr; i++) {
-               if (i + 1 < keys + nr &&
-                   i[0].btree == i[1].btree &&
-                   bpos_eq(i[0].k.k.p, i[1].k.k.p))
-                       continue;
-
-               *dst = *i;
-               dst++;
-       }
-       nr = dst - keys;
-
+       /*
+        * Now sort the rest by journal seq and bump the journal pin as we go.
+        * The fast path zapped the seq of keys that were successfully flushed so
+        * we can skip those here.
+        */
         sort(keys, nr, sizeof(keys[0]),
              btree_write_buffered_journal_cmp,
              NULL);
  
+       commit_flags &= ~BCH_WATERMARK_MASK;
+       commit_flags |= BCH_WATERMARK_reclaim;
+
         for (i = keys; i < keys + nr; i++) {
+               if (!i->journal_seq)
+                       continue;
+
                 if (i->journal_seq > pin.seq) {
                         struct journal_entry_pin pin2;
  
@@ -229,9 +266,8 @@ slowpath:
                 ret = commit_do(trans, NULL, NULL,
                                 commit_flags|
                                 BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_JOURNAL_RECLAIM|
-                               JOURNAL_WATERMARK_reserved,
-                               __bch2_btree_insert(trans, i->btree, &i->k, 0));
+                               BTREE_INSERT_JOURNAL_RECLAIM,
+                               btree_write_buffered_insert(trans, i));
                 if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
                         break;
         }
@@ -260,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
         mutex_lock(&wb->flush_lock);
  
         return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+                       __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
  }
  
  static inline u64 btree_write_buffer_ref(int idx)
@@ -333,7 +369,7 @@ int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
         wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
         wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
         if (!wb->keys[0] || !wb->keys[1])
-               return -ENOMEM;
+               return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
  
         return 0;
  }