]> git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 3ca08ab51ec9 bcachefs: six locks: Simplify optimistic...
authorKent Overstreet <kent.overstreet@linux.dev>
Mon, 13 Nov 2023 01:53:57 +0000 (20:53 -0500)
committerKent Overstreet <kent.overstreet@linux.dev>
Mon, 13 Nov 2023 01:57:28 +0000 (20:57 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
65 files changed:
.bcachefs_revision
include/linux/kernel.h
include/linux/list.h
libbcachefs/alloc_background.c
libbcachefs/backpointers.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache_types.h [new file with mode: 0644]
libbcachefs/btree_trans_commit.c
libbcachefs/btree_types.h
libbcachefs/btree_update.c
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_write_buffer.c
libbcachefs/darray.c [new file with mode: 0644]
libbcachefs/darray.h
libbcachefs/data_update.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/disk_groups.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/errcode.h
libbcachefs/fs-io-pagecache.c
libbcachefs/fs-io-pagecache.h
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/io_misc.c
libbcachefs/io_read.c
libbcachefs/io_write.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/logged_ops.c
libbcachefs/lru.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/sb-clean.c
libbcachefs/sb-errors.c
libbcachefs/sb-members.c
libbcachefs/snapshot.c
libbcachefs/str_hash.h
libbcachefs/subvolume.c
libbcachefs/subvolume_types.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/trace.h
libbcachefs/util.c
libbcachefs/util.h

index 8abb4eddb0934fbd1b8bc0a51e006a4c7275cf1e..bf2104d1c09d5f763b25145d1aac31e6ec1098f8 100644 (file)
@@ -1 +1 @@
-d464ec667b2b9de097e39d1505b45aafd87a9552
+3ca08ab51ec996180c20105489176b8c4327240c
index 35a7207e0495c08d950a991abbebebdf03b9a782..f9a5712938101e176864214c5ef6480275d0303b 100644 (file)
@@ -278,4 +278,7 @@ static inline void dump_stack(void) {}
 #define unsafe_memcpy(dst, src, bytes, justification)          \
        memcpy(dst, src, bytes)
 
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+       __DECLARE_FLEX_ARRAY(TYPE, NAME)
+
 #endif
index bdd09efa7968534149b7c47ec27d5983309424b8..d176d0d3485e5929f31a76b14ac5eb8008fa2fce 100644 (file)
@@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n)
             pos;                                                       \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
 
+static inline size_t list_count_nodes(struct list_head *head)
+{
+       struct list_head *pos;
+       size_t count = 0;
+
+       list_for_each(pos, head)
+               count++;
+
+       return count;
+}
+
 #endif /* _LIST_LIST_H */
index 1fec0e67891f120efefed775c8010bc1b6675a86..113273b214645ff5ac43f508ed2d168ccd1c1743 100644 (file)
@@ -561,8 +561,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
                if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
                        ret = commit_do(trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_LAZY_RW,
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_lazy_rw,
                                bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
                        if (ret)
                                break;
@@ -581,8 +581,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
        if (have_bucket_gens_key && !ret)
                ret = commit_do(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_LAZY_RW,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_lazy_rw,
                        bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 
        bch2_trans_put(trans);
@@ -1267,7 +1267,7 @@ delete:
        ret =   bch2_btree_delete_extent_at(trans, iter,
                        iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                       BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+                       BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
        goto out;
 }
 
@@ -1422,8 +1422,8 @@ int bch2_check_alloc_info(struct bch_fs *c)
                }
 
                ret = bch2_trans_commit(trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_LAZY_RW);
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_lazy_rw);
                if (ret)
                        goto bkey_err;
 
@@ -1453,7 +1453,7 @@ bkey_err:
              for_each_btree_key_commit(trans, iter,
                        BTREE_ID_bucket_gens, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
                bch2_check_bucket_gens_key(trans, &iter, k));
 err:
        bch2_trans_put(trans);
@@ -1546,7 +1546,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
        ret = bch2_trans_run(c,
                for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
                        bch2_check_alloc_to_lru_ref(trans, &iter)));
        if (ret)
                bch_err_fn(c, ret);
@@ -1655,7 +1655,7 @@ write:
        ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_WATERMARK_btree|
-                                 BTREE_INSERT_NOFAIL);
+                                 BCH_TRANS_COMMIT_no_enospc);
        if (ret)
                goto out;
 
@@ -1760,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
                                BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_WATERMARK_btree|
-                                 BTREE_INSERT_NOFAIL);
+                                 BCH_TRANS_COMMIT_no_enospc);
        if (ret)
                goto out;
 
@@ -1884,8 +1884,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 
                        ret =   bch2_bucket_do_index(trans, k, a, true) ?:
                                bch2_trans_commit(trans, NULL, NULL,
-                                                 BTREE_INSERT_LAZY_RW|
-                                                 BTREE_INSERT_NOFAIL);
+                                                 BCH_TRANS_COMMIT_lazy_rw|
+                                                 BCH_TRANS_COMMIT_no_enospc);
                        if (ret)
                                goto bkey_err;
 
@@ -1905,8 +1905,8 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
 
                        ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
                                bch2_trans_commit(trans, NULL, NULL,
-                                                 BTREE_INSERT_LAZY_RW|
-                                                 BTREE_INSERT_NOFAIL);
+                                                 BCH_TRANS_COMMIT_lazy_rw|
+                                                 BCH_TRANS_COMMIT_no_enospc);
                        if (ret)
                                goto bkey_err;
 
index 5ed96dddae08fee8c579919cf6db2061546a5b20..5025a71ad6851709957f1d27bf3e46a3b66d962b 100644 (file)
@@ -5,6 +5,7 @@
 #include "backpointers.h"
 #include "btree_cache.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "error.h"
 
@@ -220,18 +221,22 @@ out:
 static void backpointer_not_found(struct btree_trans *trans,
                                  struct bpos bp_pos,
                                  struct bch_backpointer bp,
-                                 struct bkey_s_c k,
-                                 const char *thing_it_points_to)
+                                 struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        struct printbuf buf = PRINTBUF;
        struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
 
+       /*
+        * If we're using the btree write buffer, the backpointer we were
+        * looking at may have already been deleted - failure to find what it
+        * pointed to is not an error:
+        */
        if (likely(!bch2_backpointers_no_use_write_buffer))
                return;
 
        prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
-                  thing_it_points_to);
+                  bp.level ? "btree node" : "extent");
        prt_printf(&buf, "bucket: ");
        bch2_bpos_to_text(&buf, bucket);
        prt_printf(&buf, "\n  ");
@@ -257,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
                                         struct bch_backpointer bp,
                                         unsigned iter_flags)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
-       struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
-       struct bkey_s_c k;
-
-       bch2_trans_node_iter_init(trans, iter,
-                                 bp.btree_id,
-                                 bp.pos,
-                                 0,
-                                 min(bp.level, r->level),
-                                 iter_flags);
-       k = bch2_btree_iter_peek_slot(iter);
-       if (bkey_err(k)) {
-               bch2_trans_iter_exit(trans, iter);
-               return k;
-       }
-
-       if (bp.level == r->level + 1)
-               k = bkey_i_to_s_c(&r->key);
-
-       if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
-               return k;
-
-       bch2_trans_iter_exit(trans, iter);
+       if (likely(!bp.level)) {
+               struct bch_fs *c = trans->c;
+               struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+               struct bkey_s_c k;
+
+               bch2_trans_node_iter_init(trans, iter,
+                                         bp.btree_id,
+                                         bp.pos,
+                                         0, 0,
+                                         iter_flags);
+               k = bch2_btree_iter_peek_slot(iter);
+               if (bkey_err(k)) {
+                       bch2_trans_iter_exit(trans, iter);
+                       return k;
+               }
 
-       if (unlikely(bch2_backpointers_no_use_write_buffer)) {
-               if (bp.level) {
-                       struct btree *b;
+               if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+                       return k;
 
-                       /*
-                        * If a backpointer for a btree node wasn't found, it may be
-                        * because it was overwritten by a new btree node that hasn't
-                        * been written out yet - backpointer_get_node() checks for
-                        * this:
-                        */
-                       b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
-                       if (!IS_ERR_OR_NULL(b))
-                               return bkey_i_to_s_c(&b->key);
+               bch2_trans_iter_exit(trans, iter);
+               backpointer_not_found(trans, bp_pos, bp, k);
+               return bkey_s_c_null;
+       } else {
+               struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
 
+               if (IS_ERR_OR_NULL(b)) {
                        bch2_trans_iter_exit(trans, iter);
-
-                       if (IS_ERR(b))
-                               return bkey_s_c_err(PTR_ERR(b));
-                       return bkey_s_c_null;
+                       return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
                }
-
-               backpointer_not_found(trans, bp_pos, bp, k, "extent");
+               return bkey_i_to_s_c(&b->key);
        }
-
-       return bkey_s_c_null;
 }
 
 struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@@ -327,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
                                  bp.level - 1,
                                  0);
        b = bch2_btree_iter_peek_node(iter);
-       if (IS_ERR(b))
+       if (IS_ERR_OR_NULL(b))
                goto err;
 
-       if (b && extent_matches_bp(c, bp.btree_id, bp.level,
-                                  bkey_i_to_s_c(&b->key),
-                                  bucket, bp))
+       BUG_ON(b->c.level != bp.level - 1);
+
+       if (extent_matches_bp(c, bp.btree_id, bp.level,
+                             bkey_i_to_s_c(&b->key),
+                             bucket, bp))
                return b;
 
-       if (b && btree_node_will_make_reachable(b)) {
+       if (btree_node_will_make_reachable(b)) {
                b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
        } else {
-               backpointer_not_found(trans, bp_pos, bp,
-                                     bkey_i_to_s_c(&b->key), "btree node");
+               backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
                b = NULL;
        }
 err:
@@ -395,7 +382,7 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
        ret = bch2_trans_run(c,
                for_each_btree_key_commit(trans, iter,
                        BTREE_ID_backpointers, POS_MIN, 0, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                  bch2_check_btree_backpointer(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
@@ -642,8 +629,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 
                do {
                        ret = commit_do(trans, NULL, NULL,
-                                       BTREE_INSERT_LAZY_RW|
-                                       BTREE_INSERT_NOFAIL,
+                                       BCH_TRANS_COMMIT_lazy_rw|
+                                       BCH_TRANS_COMMIT_no_enospc,
                                        check_extent_to_backpointers(trans, &iter,
                                                                bucket_start, bucket_end,
                                                                &last_flushed));
@@ -657,8 +644,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
                        break;
 
                ret = commit_do(trans, NULL, NULL,
-                               BTREE_INSERT_LAZY_RW|
-                               BTREE_INSERT_NOFAIL,
+                               BCH_TRANS_COMMIT_lazy_rw|
+                               BCH_TRANS_COMMIT_no_enospc,
                                check_btree_root_to_backpointers(trans, btree_id,
                                                        bucket_start, bucket_end,
                                                        &last_flushed));
@@ -797,7 +784,8 @@ static int check_one_backpointer(struct btree_trans *trans,
 
        if (fsck_err_on(!k.k, c,
                        backpointer_to_missing_ptr,
-                       "backpointer for missing extent\n  %s",
+                       "backpointer for missing %s\n  %s",
+                       bp.v->level ? "btree node" : "extent",
                        (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
                ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
                goto out;
@@ -819,7 +807,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
        return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
                                  POS_MIN, BTREE_ITER_PREFETCH, k,
-                                 NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                                 NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_one_backpointer(trans, start, end,
                                      bkey_s_c_to_backpointer(k),
                                      &last_flushed_pos));
index 9cb8684959ee17affdc2a558c14414477fef922d..3117ab4426a74ae8262f79ccaffc325d28ba4254 100644 (file)
@@ -401,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(journal_flush_write)                  \
        x(journal_noflush_write)                \
        x(journal_flush_seq)                    \
-       x(blocked_journal)                      \
+       x(blocked_journal_low_on_space)         \
+       x(blocked_journal_low_on_pin)           \
+       x(blocked_journal_max_in_flight)        \
        x(blocked_allocate)                     \
        x(blocked_allocate_open_bucket)         \
        x(nocow_lock_contended)
@@ -617,7 +619,7 @@ struct journal_seq_blacklist_table {
                u64             start;
                u64             end;
                bool            dirty;
-       }                       entries[0];
+       }                       entries[];
 };
 
 struct journal_keys {
index 7a1c244071f9254dfdf0c3eca51b79100955bd76..0a750953ff921b9d62d9fd1918da27d375c2c6dc 100644 (file)
@@ -2256,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH,       struct jset, flags, 5, 6);
 enum btree_id_flags {
        BTREE_ID_EXTENTS        = BIT(0),
        BTREE_ID_SNAPSHOTS      = BIT(1),
-       BTREE_ID_DATA           = BIT(2),
+       BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+       BTREE_ID_DATA           = BIT(3),
 };
 
 #define BCH_BTREE_IDS()                                                                \
@@ -2311,12 +2312,12 @@ enum btree_id_flags {
          BIT_ULL(KEY_TYPE_bucket_gens))                                        \
        x(snapshot_trees,       15,     0,                                      \
          BIT_ULL(KEY_TYPE_snapshot_tree))                                      \
-       x(deleted_inodes,       16,     BTREE_ID_SNAPSHOTS,                     \
+       x(deleted_inodes,       16,     BTREE_ID_SNAPSHOT_FIELD,                \
          BIT_ULL(KEY_TYPE_set))                                                \
        x(logged_ops,           17,     0,                                      \
          BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
          BIT_ULL(KEY_TYPE_logged_op_finsert))                                  \
-       x(rebalance_work,       18,     BTREE_ID_SNAPSHOTS,                     \
+       x(rebalance_work,       18,     BTREE_ID_SNAPSHOT_FIELD,                \
          BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
 
 enum btree_id {
index 2f518d7e1a6444fa2f03b86235f850f7190c010c..761f5e33b1e69e94ca0aaaa41a9825e496b5840f 100644 (file)
@@ -186,15 +186,20 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
        if (type != BKEY_TYPE_btree) {
                enum btree_id btree = type - 1;
 
-               bkey_fsck_err_on(!btree_type_has_snapshots(btree) &&
-                                k.k->p.snapshot, c, err,
-                                bkey_snapshot_nonzero,
-                                "nonzero snapshot");
-
-               bkey_fsck_err_on(btree_type_has_snapshots(btree) &&
-                                !k.k->p.snapshot, c, err,
-                                bkey_snapshot_zero,
-                                "snapshot == 0");
+               if (btree_type_has_snapshots(btree)) {
+                       bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+                                        bkey_snapshot_zero,
+                                        "snapshot == 0");
+               } else if (!btree_type_has_snapshot_field(btree)) {
+                       bkey_fsck_err_on(k.k->p.snapshot, c, err,
+                                        bkey_snapshot_nonzero,
+                                        "nonzero snapshot");
+               } else {
+                       /*
+                        * btree uses snapshot field but it's not required to be
+                        * nonzero
+                        */
+               }
 
                bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
                                 bkey_at_pos_max,
index 3a370b7087acea9bed0de0e1c565034336303d01..912adadfb4dd40a3435d7f6a82eba365a750fa67 100644 (file)
@@ -93,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans,
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
        __BTREE_UPDATE_NOJOURNAL,
-       __BTREE_UPDATE_PREJOURNAL,
        __BTREE_UPDATE_KEY_CACHE_RECLAIM,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
@@ -108,7 +107,6 @@ enum btree_update_flags {
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
 #define BTREE_UPDATE_NOJOURNAL         (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL                (1U << __BTREE_UPDATE_PREJOURNAL)
 #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
index 7cd517ae5bf49a56bff58aa0da11d896242e56ef..c4922bd30fafa52990cffca0ab19761fa28fc97d 100644 (file)
@@ -1502,7 +1502,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
                ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS(ca->dev_idx, ca->mi.first_bucket),
                                BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_LAZY_RW,
+                               NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
                        bch2_alloc_write_key(trans, &iter, k, metadata_only));
 
                if (ret < 0) {
@@ -1659,7 +1659,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
        ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_reflink, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 
        c->reflink_gc_nr = 0;
@@ -1783,7 +1783,7 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
        ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_stripes, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                bch2_gc_write_stripes_key(trans, &iter, k));
 
        bch2_trans_put(trans);
@@ -2019,7 +2019,7 @@ int bch2_gc_gens(struct bch_fs *c)
                                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                                        k,
                                        NULL, NULL,
-                                       BTREE_INSERT_NOFAIL,
+                                       BCH_TRANS_COMMIT_no_enospc,
                                gc_btree_gens_key(trans, &iter, k));
                        if (ret && !bch2_err_matches(ret, EROFS))
                                bch_err_fn(c, ret);
@@ -2032,7 +2032,7 @@ int bch2_gc_gens(struct bch_fs *c)
                        BTREE_ITER_PREFETCH,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_NOFAIL,
+                       BCH_TRANS_COMMIT_no_enospc,
                bch2_alloc_write_oldest_gen(trans, &iter, k));
        if (ret && !bch2_err_matches(ret, EROFS))
                bch_err_fn(c, ret);
index 37d896edb06e0475cc7146e31a2790321f842394..1f73ee0ee359bbf3c7434e2254ea2361432d968d 100644 (file)
@@ -1801,9 +1801,9 @@ static void btree_node_write_work(struct work_struct *work)
                ret = bch2_trans_do(c, NULL, NULL, 0,
                        bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
                                        BCH_WATERMARK_reclaim|
-                                       BTREE_INSERT_JOURNAL_RECLAIM|
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_NOCHECK_RW,
+                                       BCH_TRANS_COMMIT_journal_reclaim|
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_no_check_rw,
                                        !wbio->wbio.failed.nr));
                if (ret)
                        goto err;
index ba392eb02a57b53493cf6e91fab14b513b51083f..104172f6822b3e26ed5acd8b64df79e9f5971f01 100644 (file)
@@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
        BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              !btree_type_has_snapshots(iter->btree_id));
+              !btree_type_has_snapshot_field(iter->btree_id));
 
        if (iter->update_path)
                bch2_btree_path_verify(trans, iter->update_path);
@@ -1214,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                   struct btree_path *path, struct bpos new_pos,
                   bool intent, unsigned long ip, int cmp)
 {
-       unsigned level = path->level;
-
        bch2_trans_verify_not_in_restart(trans);
        EBUG_ON(!path->ref);
 
@@ -1231,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                goto out;
        }
 
-       level = btree_path_up_until_good_node(trans, path, cmp);
+       unsigned level = btree_path_up_until_good_node(trans, path, cmp);
 
        if (btree_path_node(path, level)) {
                struct btree_path_level *l = &path->l[level];
@@ -2835,8 +2833,9 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 
 static inline void check_srcu_held_too_long(struct btree_trans *trans)
 {
-       WARN(time_after(jiffies, trans->srcu_lock_time + HZ * 10),
-            "btree trans held srcu lock (delaying memory reclaim) by more than 10 seconds");
+       WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+            "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+            (jiffies - trans->srcu_lock_time) / HZ);
 }
 
 void bch2_trans_srcu_unlock(struct btree_trans *trans)
@@ -3088,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
                srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
        }
 
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
        kfree(trans->extra_journal_entries.data);
 
        if (trans->fs_usage_deltas) {
index 5e103f519e62ec280863c389cb765904a6becb91..85e7cb52f6b6c41b95c7dc3496a9340c9f2b38d0 100644 (file)
@@ -416,7 +416,7 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
                flags |= BTREE_ITER_IS_EXTENTS;
 
        if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
-           !btree_type_has_snapshots(btree_id))
+           !btree_type_has_snapshot_field(btree_id))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
        if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
index 98aeedb7c22a2062e401803064fe7436e26b23f0..8e80a5b687fe04e685f61b022333befa218c0c9c 100644 (file)
@@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-       if (ck->c.lock.readers)
+       if (ck->c.lock.readers) {
                list_move_tail(&ck->list, &bc->freed_pcpu);
-       else
+               bc->nr_freed_pcpu++;
+       } else {
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
+       }
        atomic_long_inc(&bc->nr_freed);
 
        kfree(ck->k);
@@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 {
        struct bkey_cached *pos;
 
+       bc->nr_freed_nonpcpu++;
+
        list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
                if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
                                 pos->btree_trans_barrier_seq)) {
@@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 #else
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
                mutex_unlock(&bc->lock);
 #endif
        } else {
@@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                               f->nr < ARRAY_SIZE(f->objs) / 2) {
                                ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                                list_del_init(&ck->list);
+                               bc->nr_freed_nonpcpu--;
                                f->objs[f->nr++] = ck;
                        }
 
@@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                if (!list_empty(&bc->freed_nonpcpu)) {
                        ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                        list_del_init(&ck->list);
+                       bc->nr_freed_nonpcpu--;
                }
                mutex_unlock(&bc->lock);
 #endif
@@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOCHECK_RW|
-                                 BTREE_INSERT_NOFAIL|
+                                 BCH_TRANS_COMMIT_no_check_rw|
+                                 BCH_TRANS_COMMIT_no_enospc|
                                  (ck->journal.seq == journal_last_seq(j)
                                   ? BCH_WATERMARK_reclaim
                                   : 0)|
@@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                goto out;
 
        bch2_journal_pin_drop(j, &ck->journal);
-       bch2_journal_preres_put(j, &ck->res);
 
        BUG_ON(!btree_node_locked(c_iter.path, 0));
 
@@ -728,7 +735,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
 
        ret = commit_do(trans, NULL, NULL, 0,
                btree_key_cache_flush_pos(trans, key, seq,
-                               BTREE_INSERT_JOURNAL_RECLAIM, false));
+                               BCH_TRANS_COMMIT_journal_reclaim, false));
 unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 
@@ -763,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
        BUG_ON(insert->k.u64s > ck->u64s);
 
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               int difference;
-
-               BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
-               difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
-               if (difference > 0) {
-                       trans->journal_preres.u64s      -= difference;
-                       ck->res.u64s                    += difference;
-               }
-       }
-
        bkey_copy(ck->k, insert);
        ck->valid = true;
 
@@ -852,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
+       scanned += bc->nr_freed_nonpcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -861,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_nonpcpu--;
        }
 
        if (scanned >= nr)
                goto out;
 
+       scanned += bc->nr_freed_pcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -877,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_pcpu--;
        }
 
        if (scanned >= nr)
@@ -985,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        }
 #endif
 
+       BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+       BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
        list_splice(&bc->freed_pcpu,    &items);
        list_splice(&bc->freed_nonpcpu, &items);
 
@@ -994,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                cond_resched();
 
                bch2_journal_pin_drop(&c->journal, &ck->journal);
-               bch2_journal_preres_put(&c->journal, &ck->res);
 
                list_del(&ck->list);
                kfree(ck->k);
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
new file mode 100644 (file)
index 0000000..cfd09f5
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+       struct bkey_cached      *objs[16];
+       unsigned                nr;
+};
+
+struct btree_key_cache {
+       struct mutex            lock;
+       struct rhashtable       table;
+       bool                    table_init_done;
+
+       struct list_head        freed_pcpu;
+       size_t                  nr_freed_pcpu;
+       struct list_head        freed_nonpcpu;
+       size_t                  nr_freed_nonpcpu;
+
+       struct shrinker         shrink;
+       unsigned                shrink_iter;
+       struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+       atomic_long_t           nr_freed;
+       atomic_long_t           nr_keys;
+       atomic_long_t           nr_dirty;
+};
+
+struct bkey_cached_key {
+       u32                     btree_id;
+       struct bpos             pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
index 32693f7c6221043d0c28b07e57f1b396bd845582..70077efae7889bb143195821b0d039d2dca64a89 100644 (file)
@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
                bch2_btree_init_next(trans, b);
 }
 
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       EBUG_ON(trans->write_locked);
+
+       trans_for_each_update(trans, i) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+                       return trans_lock_write_fail(trans, i);
+
+               if (!i->cached)
+                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trans->write_locked = true;
+       return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+       if (likely(trans->write_locked)) {
+               struct btree_insert_entry *i;
+
+               trans_for_each_update(trans, i)
+                       if (!same_leaf_as_prev(trans, i))
+                               bch2_btree_node_unlock_write_inlined(trans, i->path,
+                                                                    insert_l(i)->b);
+               trans->write_locked = false;
+       }
+}
+
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
@@ -269,23 +316,13 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
        BUG_ON(i->level         != i->path->level);
        BUG_ON(i->btree_id      != i->path->btree_id);
        EBUG_ON(!i->level &&
+               btree_type_has_snapshots(i->btree_id) &&
                !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
                test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
                i->k->k.p.snapshot &&
                bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
 }
 
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
-                                  unsigned long trace_ip)
-{
-       return drop_locks_do(trans,
-               bch2_journal_preres_get(&trans->c->journal,
-                       &trans->journal_preres,
-                       trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)));
-}
-
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
                                                      unsigned flags)
 {
@@ -320,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
        return 0;
 }
 
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+                                    struct btree_path *path, unsigned new_u64s)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct bkey_cached *ck = (void *) path->l[0].b;
+       struct bkey_i *new_k;
+       int ret;
+
+       bch2_trans_unlock_write(trans);
+       bch2_trans_unlock(trans);
+
+       new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+       if (!new_k) {
+               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+                       bch2_btree_id_str(path->btree_id), new_u64s);
+               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+       }
+
+       ret =   bch2_trans_relock(trans) ?:
+               bch2_trans_lock_write(trans);
+       if (unlikely(ret)) {
+               kfree(new_k);
+               return ret;
+       }
+
+       memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+       trans_for_each_update(trans, i)
+               if (i->old_v == &ck->k->v)
+                       i->old_v = &new_k->v;
+
+       kfree(ck->k);
+       ck->u64s        = new_u64s;
+       ck->k           = new_k;
+       return 0;
+}
+
 static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
                                       struct btree_path *path, unsigned u64s)
 {
@@ -333,7 +409,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
            bch2_btree_key_cache_must_wait(c) &&
-           !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+           !(flags & BCH_TRANS_COMMIT_journal_reclaim))
                return -BCH_ERR_btree_insert_need_journal_reclaim;
 
        /*
@@ -346,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
                return 0;
 
        new_u64s        = roundup_pow_of_two(u64s);
-       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
-       if (!new_k) {
-               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
-                       bch2_btree_id_str(path->btree_id), new_u64s);
-               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
-       }
+       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+       if (unlikely(!new_k))
+               return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
 
        trans_for_each_update(trans, i)
                if (i->old_v == &ck->k->v)
@@ -583,6 +656,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                        *stopped_at = i;
                        return ret;
                }
+
+               i->k->k.needs_whiteout = false;
        }
 
        if (trans->nr_wb_updates &&
@@ -593,7 +668,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
         * Don't get journal reservation until after we know insert will
         * succeed:
         */
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
                ret = bch2_trans_journal_res_get(trans,
                                (flags & BCH_WATERMARK_MASK)|
                                JOURNAL_RES_GET_NONBLOCK);
@@ -602,8 +677,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
                if (unlikely(trans->journal_transaction_names))
                        journal_transaction_name(trans);
-       } else {
-               trans->journal_res.seq = c->journal.replay_journal_seq;
        }
 
        /*
@@ -612,7 +685,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
         */
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+           !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
                if (bch2_journal_seq_verify)
                        trans_for_each_update(trans, i)
                                i->k->k.version.lo = trans->journal_res.seq;
@@ -626,7 +699,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                return -BCH_ERR_btree_insert_need_mark_replicas;
 
        if (trans->nr_wb_updates) {
-               EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
+               EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
 
                ret = bch2_btree_insert_keys_write_buffer(trans);
                if (ret)
@@ -663,7 +736,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                trans->journal_res.u64s         -= trans->extra_journal_entries.nr;
        }
 
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
                struct journal *j = &c->journal;
                struct jset_entry *entry;
 
@@ -705,15 +778,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
        }
 
        trans_for_each_update(trans, i) {
-               i->k->k.needs_whiteout = false;
-
                if (!i->cached) {
-                       u64 seq = trans->journal_res.seq;
-
-                       if (i->flags & BTREE_UPDATE_PREJOURNAL)
-                               seq = i->seq;
-
-                       bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+                       bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq);
                } else if (!i->key_cache_already_flushed)
                        bch2_btree_insert_key_cached(trans, flags, i);
                else {
@@ -731,37 +797,6 @@ revert_fs_usage:
        return ret;
 }
 
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       while (--i >= trans->updates) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
-       }
-
-       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
-static inline int trans_lock_write(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
-                       return trans_lock_write_fail(trans, i);
-
-               if (!i->cached)
-                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
-       }
-
-       return 0;
-}
-
 static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
 {
        struct btree_insert_entry *i;
@@ -799,6 +834,12 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
        return -EINVAL;
 }
 
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
 /*
  * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
@@ -829,15 +870,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
                }
        }
 
-       ret = bch2_journal_preres_get(&c->journal,
-                       &trans->journal_preres, trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
-       if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
-               ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
-       if (unlikely(ret))
-               return ret;
-
-       ret = trans_lock_write(trans);
+       ret = bch2_trans_lock_write(trans);
        if (unlikely(ret))
                return ret;
 
@@ -846,20 +879,19 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
        if (!ret && unlikely(trans->journal_replay_not_finished))
                bch2_drop_overwrites_from_journal(trans);
 
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(trans, i->path,
-                                                       insert_l(i)->b);
+       bch2_trans_unlock_write(trans);
 
        if (!ret && trans->journal_pin)
                bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-                                    trans->journal_pin, NULL);
+                                    trans->journal_pin,
+                                    bch2_trans_commit_journal_pin_flush);
 
        /*
         * Drop journal reservation after dropping write locks, since dropping
         * the journal reservation may kick off a journal write:
         */
-       bch2_journal_res_put(&c->journal, &trans->journal_res);
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+               bch2_journal_res_put(&c->journal, &trans->journal_res);
 
        return ret;
 }
@@ -896,7 +928,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                 * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
                 * flag
                 */
-               if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+               if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
                    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        break;
@@ -931,7 +963,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
                        if (wb->state.nr > wb->size * 3 / 4) {
                                bch2_trans_begin(trans);
                                ret = __bch2_btree_write_buffer_flush(trans,
-                                               flags|BTREE_INSERT_NOCHECK_RW, true);
+                                               flags|BCH_TRANS_COMMIT_no_check_rw, true);
                                if (!ret) {
                                        trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
                                        ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -951,8 +983,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
        BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
 
        bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-                               !(flags & BTREE_INSERT_NOWAIT) &&
-                               (flags & BTREE_INSERT_NOFAIL), c,
+                               (flags & BCH_TRANS_COMMIT_no_enospc), c,
                "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
 
        return ret;
@@ -964,7 +995,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
        struct bch_fs *c = trans->c;
        int ret;
 
-       if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
+       if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
            test_bit(BCH_FS_STARTED, &c->flags))
                return -BCH_ERR_erofs_trans_commit;
 
@@ -1002,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i = NULL;
        struct btree_write_buffered_key *wb;
-       unsigned u64s;
        int ret = 0;
 
        if (!trans->nr_updates &&
@@ -1010,9 +1040,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
            !trans->extra_journal_entries.nr)
                goto out_reset;
 
-       if (flags & BTREE_INSERT_GC_LOCK_HELD)
-               lockdep_assert_held(&c->gc_lock);
-
        ret = bch2_trans_commit_run_triggers(trans);
        if (ret)
                goto out_reset;
@@ -1021,7 +1048,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                struct printbuf buf = PRINTBUF;
                enum bkey_invalid_flags invalid_flags = 0;
 
-               if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+               if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
                        invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
 
                if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1039,7 +1066,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                goto out_reset;
        }
 
-       if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+       if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
            unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
                ret = bch2_trans_commit_get_rw_cold(trans, flags);
                if (ret)
@@ -1052,7 +1079,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                bch2_trans_unlock(trans);
 
                ret = __bch2_btree_write_buffer_flush(trans,
-                                       flags|BTREE_INSERT_NOCHECK_RW, true);
+                                       flags|BCH_TRANS_COMMIT_no_check_rw, true);
                if (!ret) {
                        trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
                        ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
@@ -1062,13 +1089,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 
        EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
-       memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
        trans->journal_u64s             = trans->extra_journal_entries.nr;
-       trans->journal_preres_u64s      = 0;
-
        trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
        if (trans->journal_transaction_names)
                trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
 
@@ -1084,16 +1106,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                if (i->key_cache_already_flushed)
                        continue;
 
-               /* we're going to journal the key being updated: */
-               u64s = jset_u64s(i->k->k.u64s);
-               if (i->cached &&
-                   likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
-                       trans->journal_preres_u64s += u64s;
-
                if (i->flags & BTREE_UPDATE_NOJOURNAL)
                        continue;
 
-               trans->journal_u64s += u64s;
+               /* we're going to journal the key being updated: */
+               trans->journal_u64s += jset_u64s(i->k->k.u64s);
 
                /* and we're also going to log the overwrite: */
                if (trans->journal_transaction_names)
@@ -1106,14 +1123,15 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
        if (trans->extra_journal_res) {
                ret = bch2_disk_reservation_add(c, trans->disk_res,
                                trans->extra_journal_res,
-                               (flags & BTREE_INSERT_NOFAIL)
+                               (flags & BCH_TRANS_COMMIT_no_enospc)
                                ? BCH_DISK_RESERVATION_NOFAIL : 0);
                if (ret)
                        goto err;
        }
 retry:
        bch2_trans_verify_not_in_restart(trans);
-       memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+               memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
        ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
 
@@ -1125,9 +1143,7 @@ retry:
 
        trace_and_count(c, transaction_commit, trans, _RET_IP_);
 out:
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
-       if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
                bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
        if (!ret)
@@ -1140,5 +1156,17 @@ err:
        if (ret)
                goto out;
 
+       /*
+        * We might have done another transaction commit in the error path -
+        * i.e. btree write buffer flush - which will have made use of
+        * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+        * how the journal sequence number to pin is passed in - so we must
+        * restart:
+        */
+       if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+               ret = -BCH_ERR_transaction_restart_nested;
+               goto out;
+       }
+
        goto retry;
 }
index 4b9cc61a4a6098f122c4d04a7657699ec20959dd..a3f24cd0043dbbf19783063db9cc45205ab53b79 100644 (file)
@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
 #include "errcode.h"
@@ -322,31 +322,6 @@ struct btree_iter {
 #endif
 };
 
-struct btree_key_cache_freelist {
-       struct bkey_cached      *objs[16];
-       unsigned                nr;
-};
-
-struct btree_key_cache {
-       struct mutex            lock;
-       struct rhashtable       table;
-       bool                    table_init_done;
-       struct list_head        freed_pcpu;
-       struct list_head        freed_nonpcpu;
-       struct shrinker         shrink;
-       unsigned                shrink_iter;
-       struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-       atomic_long_t           nr_freed;
-       atomic_long_t           nr_keys;
-       atomic_long_t           nr_dirty;
-};
-
-struct bkey_cached_key {
-       u32                     btree_id;
-       struct bpos             pos;
-} __packed __aligned(4);
-
 #define BKEY_CACHED_ACCESSED           0
 #define BKEY_CACHED_DIRTY              1
 
@@ -362,7 +337,6 @@ struct bkey_cached {
        struct rhash_head       hash;
        struct list_head        list;
 
-       struct journal_preres   res;
        struct journal_entry_pin journal;
        u64                     seq;
 
@@ -392,7 +366,6 @@ struct btree_insert_entry {
        u8                      old_btree_u64s;
        struct bkey_i           *k;
        struct btree_path       *path;
-       u64                     seq;
        /* key being overwritten: */
        struct bkey             old_k;
        const struct bch_val    *old_v;
@@ -441,6 +414,7 @@ struct btree_trans {
        bool                    journal_replay_not_finished:1;
        bool                    is_initial_gc:1;
        bool                    notrace_relock_fail:1;
+       bool                    write_locked:1;
        enum bch_errcode        restarted:16;
        u32                     restart_count;
        unsigned long           last_begin_ip;
@@ -472,11 +446,9 @@ struct btree_trans {
        struct journal_entry_pin *journal_pin;
 
        struct journal_res      journal_res;
-       struct journal_preres   journal_preres;
        u64                     *journal_seq;
        struct disk_reservation *disk_res;
        unsigned                journal_u64s;
-       unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
 };
 
@@ -717,6 +689,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
        return (1U << id) & mask;
 }
 
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+       const unsigned mask = 0
+#define x(name, nr, flags, ...)        |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+       BCH_BTREE_IDS()
+#undef x
+       ;
+
+       return (1U << id) & mask;
+}
+
 static inline bool btree_type_has_ptrs(enum btree_id id)
 {
        const unsigned mask = 0
index 324767c0ddccd7457004a34e8ed6e49da8c54b85..1837f84845696dd8fa515a830ac97d2a26e36be5 100644 (file)
@@ -380,21 +380,12 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i, n;
-       u64 seq = 0;
        int cmp;
 
        EBUG_ON(!path->should_be_locked);
        EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
        EBUG_ON(!bpos_eq(k->k.p, path->pos));
 
-       /*
-        * The transaction journal res hasn't been allocated at this point.
-        * That occurs at commit time. Reuse the seq field to pass in the seq
-        * of a prejournaled key.
-        */
-       if (flags & BTREE_UPDATE_PREJOURNAL)
-               seq = trans->journal_res.seq;
-
        n = (struct btree_insert_entry) {
                .flags          = flags,
                .bkey_type      = __btree_node_type(path->level, path->btree_id),
@@ -403,7 +394,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
                .cached         = path->cached,
                .path           = path,
                .k              = k,
-               .seq            = seq,
                .ip_allocated   = ip,
        };
 
@@ -431,7 +421,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
                i->cached       = n.cached;
                i->k            = n.k;
                i->path         = n.path;
-               i->seq          = n.seq;
                i->ip_allocated = n.ip_allocated;
        } else {
                array_insert_item(trans->updates, trans->nr_updates,
@@ -542,18 +531,6 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
        return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
 }
 
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
-                                      struct btree_iter *iter, struct bkey_i *k,
-                                      enum btree_update_flags flags)
-{
-       trans->journal_res.seq = seq;
-       return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
-                                                BTREE_UPDATE_PREJOURNAL);
-}
-
 int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
                                            enum btree_id btree,
                                            struct bkey_i *k)
@@ -792,7 +769,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
 
                ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
                        bch2_trans_commit(trans, &disk_res, journal_seq,
-                                         BTREE_INSERT_NOFAIL);
+                                         BCH_TRANS_COMMIT_no_enospc);
                bch2_disk_reservation_put(trans->c, &disk_res);
 err:
                /*
@@ -897,7 +874,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
                ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
        } else {
                ret = bch2_trans_do(c, NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|commit_flags,
+                       BCH_TRANS_COMMIT_lazy_rw|commit_flags,
                        __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
        }
 
index 9816d22865403043c6caa819b3f249a2e10ea6fa..14a2315aa88e4267775c910f3119728f3f5579dc 100644 (file)
@@ -21,37 +21,28 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
                                struct bkey_i *, u64);
 
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS()                                                       \
+       x(no_enospc,    "don't check for enospc")                                       \
+       x(no_check_rw,  "don't attempt to take a ref on c->writes")                     \
+       x(lazy_rw,      "go read-write if we haven't yet - only for use in recovery")   \
+       x(no_journal_res, "don't take a journal reservation, instead "                  \
+                       "pin journal entry referred to by trans->journal_res.seq")      \
+       x(journal_reclaim, "operation required for journal reclaim; may return error "  \
+                       "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
        /* First bits for bch_watermark: */
-       __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
-       __BTREE_INSERT_NOCHECK_RW,
-       __BTREE_INSERT_LAZY_RW,
-       __BTREE_INSERT_JOURNAL_REPLAY,
-       __BTREE_INSERT_JOURNAL_RECLAIM,
-       __BTREE_INSERT_NOWAIT,
-       __BTREE_INSERT_GC_LOCK_HELD,
-       __BCH_HASH_SET_MUST_CREATE,
-       __BCH_HASH_SET_MUST_REPLACE,
+       __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...)      __BCH_TRANS_COMMIT_##n,
+       BCH_TRANS_COMMIT_FLAGS()
+#undef x
 };
 
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL            BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW                BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW           BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY    BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM   BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT            BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD      BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE       BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE      BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...)      BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+       BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
 
 int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
                                unsigned, unsigned);
index 89ada89eafe7b79096f806da7541b94939fd2da1..9affcb22d9cb7025453cfd528f13738ff9514879 100644 (file)
@@ -475,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
        /*
         * Protects reaping from the btree node cache and using the btree node
         * open bucket reserve:
-        *
-        * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
-        * blocking on this lock:
         */
        ret = bch2_btree_cache_cannibalize_lock(c, cl);
        if (ret)
@@ -487,9 +484,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
                struct prealloc_nodes *p = as->prealloc_nodes + interior;
 
                while (p->nr < nr_nodes[interior]) {
-                       b = __bch2_btree_node_alloc(trans, &as->disk_res,
-                                       flags & BTREE_INSERT_NOWAIT ? NULL : cl,
-                                       interior, flags);
+                       b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+                                                   interior, flags);
                        if (IS_ERR(b)) {
                                ret = PTR_ERR(b);
                                goto err;
@@ -513,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
                up_read(&c->gc_lock);
        as->took_gc_lock = false;
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
        bch2_disk_reservation_put(c, &as->disk_res);
@@ -646,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
         */
        ret = commit_do(trans, &as->disk_res, &journal_seq,
                        BCH_WATERMARK_reclaim|
-                       BTREE_INSERT_NOFAIL|
-                       BTREE_INSERT_NOCHECK_RW|
-                       BTREE_INSERT_JOURNAL_RECLAIM,
+                       BCH_TRANS_COMMIT_no_enospc|
+                       BCH_TRANS_COMMIT_no_check_rw|
+                       BCH_TRANS_COMMIT_journal_reclaim,
                        btree_update_nodes_written_trans(trans, as));
        bch2_trans_unlock(trans);
 
@@ -734,8 +728,6 @@ err:
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        mutex_lock(&c->btree_interior_update_lock);
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
@@ -818,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
 static void btree_update_reparent(struct btree_update *as,
                                  struct btree_update *child)
 {
@@ -828,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
        child->b = NULL;
        child->mode = BTREE_INTERIOR_UPDATING_AS;
 
-       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+                             bch2_update_reparent_journal_pin_flush);
 }
 
 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -937,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
                        b->ob.v[--b->ob.nr];
 }
 
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
 /*
  * @b is being split/rewritten: it may have pointers to not-yet-written btree
  * nodes and thus outstanding btree_updates - redirect @b's
@@ -988,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         * when the new nodes are persistent and reachable on disk:
         */
        w = btree_current_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+                             bch2_btree_update_will_free_node_journal_pin_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        w = btree_prev_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+                             bch2_btree_update_will_free_node_journal_pin_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1042,7 +1049,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        struct bch_fs *c = trans->c;
        struct btree_update *as;
        u64 start_time = local_clock();
-       int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+       int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
        unsigned nr_nodes[2] = { 0, 0 };
        unsigned update_level = level;
@@ -1061,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        flags &= ~BCH_WATERMARK_MASK;
        flags |= watermark;
 
-       if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+       if (flags & BCH_TRANS_COMMIT_journal_reclaim)
                journal_flags |= JOURNAL_RES_GET_NONBLOCK;
        journal_flags |= watermark;
 
@@ -1087,9 +1094,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
                split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
        }
 
-       if (flags & BTREE_INSERT_GC_LOCK_HELD)
-               lockdep_assert_held(&c->gc_lock);
-       else if (!down_read_trylock(&c->gc_lock)) {
+       if (!down_read_trylock(&c->gc_lock)) {
                ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
                if (ret) {
                        up_read(&c->gc_lock);
@@ -1103,7 +1108,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        as->c           = c;
        as->start_time  = start_time;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
-       as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+       as->took_gc_lock = true;
        as->btree_id    = path->btree_id;
        as->update_level = update_level;
        INIT_LIST_HEAD(&as->list);
@@ -1129,27 +1134,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        if (ret)
                goto err;
 
-       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                     BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
-       if (ret) {
-               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
-                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
-                       goto err;
-               }
-
-               ret = drop_locks_do(trans,
-                       bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                             BTREE_UPDATE_JOURNAL_RES,
-                                             journal_flags));
-               if (ret == -BCH_ERR_journal_preres_get_blocked) {
-                       trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
-                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
-               }
-               if (ret)
-                       goto err;
-       }
-
        ret = bch2_disk_reservation_get(c, &as->disk_res,
                        (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
                        c->opts.metadata_replicas,
@@ -1167,7 +1151,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
                 * flag
                 */
                if (bch2_err_matches(ret, ENOSPC) &&
-                   (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+                   (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
                    watermark != BCH_WATERMARK_reclaim) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        goto err;
@@ -1855,7 +1839,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        parent = btree_node_parent(path, b);
        as = bch2_btree_update_start(trans, path, level, false,
-                                    BTREE_INSERT_NOFAIL|flags);
+                                    BCH_TRANS_COMMIT_no_enospc|flags);
        ret = PTR_ERR_OR_ZERO(as);
        if (ret)
                goto err;
@@ -1941,7 +1925,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
        struct btree_update *as;
        int ret;
 
-       flags |= BTREE_INSERT_NOFAIL;
+       flags |= BCH_TRANS_COMMIT_no_enospc;
 
        parent = btree_node_parent(iter->path, b);
        as = bch2_btree_update_start(trans, iter->path, b->c.level,
@@ -2418,23 +2402,17 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
 
 struct jset_entry *
 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
-                                   struct jset_entry *start,
-                                   struct jset_entry *end)
+                                   struct jset_entry *end,
+                                   unsigned long skip)
 {
-       struct jset_entry *entry;
-       unsigned long have = 0;
        unsigned i;
 
-       for (entry = start; entry < end; entry = vstruct_next(entry))
-               if (entry->type == BCH_JSET_ENTRY_btree_root)
-                       __set_bit(entry->btree_id, &have);
-
        mutex_lock(&c->btree_root_lock);
 
        for (i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
-               if (r->alive && !test_bit(i, &have)) {
+               if (r->alive && !test_bit(i, &skip)) {
                        journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
                                          i, r->level, &r->key, r->key.k.u64s);
                        end = vstruct_next(end);
index c2ffeb30884d795ebbdcdc08f7804c1803779352..031076e75fa1322a82a202e150a8eca9a75c063e 100644 (file)
@@ -55,7 +55,6 @@ struct btree_update {
        unsigned                        update_level;
 
        struct disk_reservation         disk_res;
-       struct journal_preres           journal_preres;
 
        /*
         * BTREE_INTERIOR_UPDATING_NODE:
@@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
 
 void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
 struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
-                                       struct jset_entry *, struct jset_entry *);
+                                       struct jset_entry *, unsigned long);
 
 void bch2_do_pending_node_rewrites(struct bch_fs *);
 void bch2_free_pending_node_rewrites(struct bch_fs *);
index 76b6f2dcaa4fb7889b2e3de3b5a48d57b7723868..a6bf6ed37ced60cfee4bb61c15c47c06d5ace9c7 100644 (file)
@@ -9,9 +9,11 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 
-#include <linux/atomic.h>
 #include <linux/sort.h>
 
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+                               struct journal_entry_pin *, u64);
+
 static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
 {
        const struct btree_write_buffered_key *l = _l;
@@ -46,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
        if (ret)
                return ret;
 
+       /*
+        * We can't clone a path that has write locks: unshare it now, before
+        * set_pos and traverse():
+        */
+       if (iter->path->ref > 1)
+               iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
        path = iter->path;
 
        if (!*write_locked) {
@@ -65,24 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 
        bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
        (*fast)++;
-
-       if (path->ref > 1) {
-               /*
-                * We can't clone a path that has write locks: if the path is
-                * shared, unlock before set_pos(), traverse():
-                */
-               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-               *write_locked = false;
-       }
        return 0;
 trans_commit:
-       return  bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
-                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+       trans->journal_res.seq = wb->journal_seq;
+
+       return  bch2_trans_update(trans, iter, &wb->k,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  commit_flags|
-                                 BTREE_INSERT_NOCHECK_RW|
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_JOURNAL_RECLAIM);
+                                 BCH_TRANS_COMMIT_no_check_rw|
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_no_journal_res|
+                                 BCH_TRANS_COMMIT_journal_reclaim);
 }
 
 static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@@ -125,9 +128,11 @@ btree_write_buffered_insert(struct btree_trans *trans,
        bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
                             BTREE_ITER_CACHED|BTREE_ITER_INTENT);
 
+       trans->journal_res.seq = wb->journal_seq;
+
        ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
-                                     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               bch2_trans_update(trans, &iter, &wb->k,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -151,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
        if (!locked && !mutex_trylock(&wb->flush_lock))
                return 0;
 
-       bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+       bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+                             bch2_btree_write_buffer_journal_flush);
        bch2_journal_pin_drop(j, &wb->journal_pin);
 
        s = btree_write_buffer_switch(wb);
@@ -169,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         * However, since we're not flushing in the order they appear in the
         * journal we won't be able to drop our journal pin until everything is
         * flushed - which means this could deadlock the journal if we weren't
-        * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+        * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
         * if it would block taking a journal reservation.
         *
         * If that happens, simply skip the key so we can optimistically insert
@@ -253,21 +259,14 @@ slowpath:
                if (!i->journal_seq)
                        continue;
 
-               if (i->journal_seq > pin.seq) {
-                       struct journal_entry_pin pin2;
-
-                       memset(&pin2, 0, sizeof(pin2));
-
-                       bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
-                       bch2_journal_pin_drop(j, &pin);
-                       bch2_journal_pin_copy(j, &pin, &pin2, NULL);
-                       bch2_journal_pin_drop(j, &pin2);
-               }
+               bch2_journal_pin_update(j, i->journal_seq, &pin,
+                             bch2_btree_write_buffer_journal_flush);
 
                ret = commit_do(trans, NULL, NULL,
                                commit_flags|
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_JOURNAL_RECLAIM,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_no_journal_res|
+                               BCH_TRANS_COMMIT_journal_reclaim,
                                btree_write_buffered_insert(trans, i));
                if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
                        break;
@@ -297,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
        mutex_lock(&wb->flush_lock);
 
        return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+                       __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
new file mode 100644 (file)
index 0000000..aae07be
--- /dev/null
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include "darray.h"
+
+int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+       if (new_size > d->size) {
+               new_size = roundup_pow_of_two(new_size);
+
+               void *data = krealloc_array(d->data, new_size, element_size, gfp);
+               if (!data)
+                       return -ENOMEM;
+
+               d->data = data;
+               d->size = new_size;
+       }
+
+       return 0;
+}
index 87b4b2d1ec766f65e97c40cab918b256015ae3ed..43ea21ad9ea338931e0cb7a54d13bf9f50874b77 100644 (file)
@@ -8,7 +8,6 @@
  * Inspired by CCAN's darray
  */
 
-#include "util.h"
 #include <linux/slab.h>
 
 #define DARRAY(type)                                                   \
@@ -19,20 +18,25 @@ struct {                                                            \
 
 typedef DARRAY(void) darray_void;
 
-static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_void *d, size_t element_size,
+                                 size_t new_size, gfp_t gfp)
 {
-       if (d->nr + more > d->size) {
-               size_t new_size = roundup_pow_of_two(d->nr + more);
-               void *data = krealloc_array(d->data, new_size, t_size, gfp);
+       return unlikely(new_size > d->size)
+               ? __bch2_darray_resize(d, element_size, new_size, gfp)
+               : 0;
+}
 
-               if (!data)
-                       return -ENOMEM;
+#define darray_resize_gfp(_d, _new_size, _gfp)                         \
+       __darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
 
-               d->data = data;
-               d->size = new_size;
-       }
+#define darray_resize(_d, _new_size)                                   \
+       darray_resize_gfp(_d, _new_size, GFP_KERNEL)
 
-       return 0;
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+       return __darray_resize(d, t_size, d->nr + more, gfp);
 }
 
 #define darray_make_room_gfp(_d, _more, _gfp)                          \
@@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
 #define darray_make_room(_d, _more)                                    \
        darray_make_room_gfp(_d, _more, GFP_KERNEL)
 
+#define darray_room(_d)                ((_d).size - (_d).nr)
+
 #define darray_top(_d)         ((_d).data[(_d).nr])
 
 #define darray_push_gfp(_d, _item, _gfp)                               \
index 0771a6d880bf5e2e4efcbcc21d91d34b64160dd4..55769d77e6e79c188fd5a94fb6db264a0066f8a1 100644 (file)
@@ -239,6 +239,34 @@ restart_drop_extra_replicas:
 
                next_pos = insert->k.p;
 
+               /*
+                * Check for nonce offset inconsistency:
+                * This is debug code - we've been seeing this bug rarely, and
+                * it's been hard to reproduce, so this should give us some more
+                * information when it does occur:
+                */
+               struct printbuf err = PRINTBUF;
+               int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+               printbuf_exit(&err);
+
+               if (invalid) {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "about to insert invalid key in data update path");
+                       prt_str(&buf, "\nold: ");
+                       bch2_bkey_val_to_text(&buf, c, old);
+                       prt_str(&buf, "\nk:   ");
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       prt_str(&buf, "\nnew: ");
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+                       bch2_print_string_as_lines(KERN_ERR, buf.buf);
+                       printbuf_exit(&buf);
+
+                       bch2_fatal_error(c);
+                       goto out;
+               }
+
                ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
                                                k.k->p, bkey_start_pos(&insert->k)) ?:
                        bch2_insert_snapshot_whiteouts(trans, m->btree_id,
@@ -250,8 +278,8 @@ restart_drop_extra_replicas:
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL|
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc|
                                m->data_opts.btree_insert_flags);
                if (!ret) {
                        bch2_btree_iter_set_pos(&iter, next_pos);
index 1a0f2d5715692baa2f26a088c61b55742e03fec3..0542d9948c24d42d1cf702d044b861b3f55af13e 100644 (file)
@@ -201,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
                       const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
-                      u64 *dir_offset, int flags)
+                      u64 *dir_offset,
+                      bch_str_hash_flags_t str_hash_flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
@@ -212,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
                return ret;
 
        ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                           dir, &dirent->k_i, flags);
+                           dir, &dirent->k_i, str_hash_flags);
        *dir_offset = dirent->k.p.offset;
 
        return ret;
index cd262bf4d9c5365747562f22536309dc5853d070..8a55245547ba0ad079568db4b4bb9aab679652f2 100644 (file)
@@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
 
 int bch2_dirent_create(struct btree_trans *, subvol_inum,
                       const struct bch_hash_info *, u8,
-                      const struct qstr *, u64, u64 *, int);
+                      const struct qstr *, u64, u64 *,
+                      bch_str_hash_flags_t);
 
 static inline unsigned vfs_d_type(unsigned type)
 {
index d613695abf9f67c2e9f2ab4ce91d863bdfd743c7..4d0cb0ccff32f2c75fa66f932f517f00b9cfdf25 100644 (file)
@@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
        case TARGET_DEV: {
                struct bch_dev *ca;
 
+               out->atomic++;
                rcu_read_lock();
                ca = t.dev < c->sb.nr_devices
                        ? rcu_dereference(c->devs[t.dev])
@@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
                }
 
                rcu_read_unlock();
+               out->atomic--;
                break;
        }
        case TARGET_GROUP:
@@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
        }
 }
 
-void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
 {
        struct target t = target_decode(v);
 
index 5da0e7a69323a5ca7d99cc5a18b4b9561027256f..c730f0933d29a9f63aec199385914fad6d083762 100644 (file)
@@ -150,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
                if (i < nr_data)
                        prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+               prt_printf(out, " gen %u", ptr->gen);
                if (ptr_stale(ca, ptr))
                        prt_printf(out, " stale");
        }
@@ -303,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                        struct bch_csum got = ec_block_checksum(buf, i, offset);
 
                        if (bch2_crc_cmp(want, got)) {
-                               struct printbuf buf2 = PRINTBUF;
+                               struct printbuf err = PRINTBUF;
+                               struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+                               prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+                                          want.hi, want.lo,
+                                          got.hi, got.lo,
+                                          bch2_csum_types[v->csum_type]);
+                               prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
+                               bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+                               bch_err_ratelimited(ca, "%s", err.buf);
+                               printbuf_exit(&err);
 
-                               bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
-
-                               bch_err_ratelimited(c,
-                                       "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
-                                       (void *) _RET_IP_, i, j, v->csum_type,
-                                       want.lo, got.lo, buf2.buf);
-                               printbuf_exit(&buf2);
                                clear_bit(i, buf->valid);
+
+                               bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
                                break;
                        }
 
@@ -475,14 +481,10 @@ err:
        return ret;
 }
 
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
-       return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
-}
-
 /* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
 {
+       struct bch_fs *c = trans->c;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bch_stripe *v;
@@ -497,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
        if (!buf)
                return -BCH_ERR_ENOMEM_ec_read_extent;
 
-       ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+       ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
        if (ret) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: error %i looking up stripe", ret);
@@ -801,7 +803,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
                if (!idx)
                        break;
 
-               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                ec_stripe_delete(trans, idx));
                if (ret) {
                        bch_err_fn(c, ret);
@@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
        while (1) {
                ret = commit_do(trans, NULL, NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL,
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_pos));
                if (ret)
@@ -1119,8 +1121,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        }
 
        ret = bch2_trans_do(c, &s->res, NULL,
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_NOFAIL,
+                           BCH_TRANS_COMMIT_no_check_rw|
+                           BCH_TRANS_COMMIT_no_enospc,
                            ec_stripe_key_update(trans,
                                        bkey_i_to_stripe(&s->new_stripe.key),
                                        !s->have_existing_stripe));
@@ -1371,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                        h->nr_active_devs++;
 
        rcu_read_unlock();
+
+       /*
+        * If we only have redundancy + 1 devices, we're better off with just
+        * replication:
+        */
+       if (h->nr_active_devs < h->redundancy + 2)
+               bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+                       h->nr_active_devs, h->redundancy + 2);
+
        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
 }
@@ -1422,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
 
        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
 found:
+       if (!IS_ERR_OR_NULL(h) &&
+           h->nr_active_devs < h->redundancy + 2) {
+               mutex_unlock(&h->lock);
+               h = NULL;
+       }
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
 }
@@ -1679,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
        int ret;
 
        h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
-       if (!h)
-               bch_err(c, "no stripe head");
        if (IS_ERR_OR_NULL(h))
                return h;
 
index 61c67aa0aa49a202a7e3d4ddee3531308b45a7e0..7d0237c9819f1a42561f5ec81512e1c4278d12fd 100644 (file)
@@ -199,7 +199,7 @@ struct ec_stripe_head {
        struct ec_stripe_new    *s;
 };
 
-int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
 
 void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
 
index 68a1a96bb7caf526a148a988c12913151c57b6d5..e5c3262cc3032d33e561b21a4a380568ae38f917 100644 (file)
@@ -73,7 +73,6 @@
        x(ENOMEM,                       ENOMEM_fsck_add_nlink)                  \
        x(ENOMEM,                       ENOMEM_journal_key_insert)              \
        x(ENOMEM,                       ENOMEM_journal_keys_sort)               \
-       x(ENOMEM,                       ENOMEM_journal_replay)                  \
        x(ENOMEM,                       ENOMEM_read_superblock_clean)           \
        x(ENOMEM,                       ENOMEM_fs_alloc)                        \
        x(ENOMEM,                       ENOMEM_fs_name_alloc)                   \
index 8bd9bcdd27f738a7a2f0d2ac831f0c77fdf20aa3..ff664fd0d8ef80e8b4816d7c430e87d41759b498 100644 (file)
@@ -13,7 +13,7 @@
 
 int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
                                     loff_t start, u64 end,
-                                    int fgp_flags, gfp_t gfp,
+                                    fgf_t fgp_flags, gfp_t gfp,
                                     folios *fs)
 {
        struct folio *f;
index a2222ad586e9e7530728507516abc33da4b0c128..27f712ae37a68209275cc3b2955a542314e80e68 100644 (file)
@@ -7,7 +7,7 @@
 typedef DARRAY(struct folio *) folios;
 
 int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
-                                    u64, int, gfp_t, folios *);
+                                    u64, fgf_t, gfp_t, folios *);
 int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
 
 /*
index 8dbc848f25b878d065d2b74949c46ad13b6dda27..f76d403ccb766d479b280c8faae33cd98f001548 100644 (file)
@@ -93,7 +93,7 @@ retry:
                                BTREE_ITER_INTENT) ?:
                (set ? set(trans, inode, &inode_u, p) : 0) ?:
                bch2_inode_write(trans, &iter, &inode_u) ?:
-               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 
        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -452,7 +452,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
 
        ret = commit_do(trans, NULL, NULL,
-                       BTREE_INSERT_NOFAIL,
+                       BCH_TRANS_COMMIT_no_enospc,
                bch2_unlink_trans(trans,
                                  inode_inum(dir), &dir_u,
                                  &inode_u, &dentry->d_name,
@@ -717,7 +717,7 @@ retry:
 
        ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL);
+                                 BCH_TRANS_COMMIT_no_enospc);
 btree_err:
        bch2_trans_iter_exit(trans, &inode_iter);
 
@@ -1922,10 +1922,7 @@ out:
        return dget(sb->s_root);
 
 err_put_super:
-       sb->s_fs_info = NULL;
-       c->vfs_sb = NULL;
        deactivate_locked_super(sb);
-       bch2_fs_stop(c);
        return ERR_PTR(bch2_err_class(ret));
 }
 
@@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
 {
        struct bch_fs *c = sb->s_fs_info;
 
-       if (c)
-               c->vfs_sb = NULL;
        generic_shutdown_super(sb);
-       if (c)
-               bch2_fs_free(c);
+       bch2_fs_free(c);
 }
 
 static struct file_system_type bcache_fs_type = {
index 9f3e9bd3d767a75fb1a0734c0413193a671f3206..127310d310fb9ca2bf4a550b72cf8eb031b3548b 100644 (file)
@@ -208,8 +208,8 @@ static int fsck_write_inode(struct btree_trans *trans,
                            u32 snapshot)
 {
        int ret = commit_do(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW,
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_lazy_rw,
                                  __write_inode(trans, inode, snapshot));
        if (ret)
                bch_err_fn(trans->c, ret);
@@ -354,8 +354,8 @@ static int reattach_inode(struct btree_trans *trans,
                          u32 inode_snapshot)
 {
        int ret = commit_do(trans, NULL, NULL,
-                                 BTREE_INSERT_LAZY_RW|
-                                 BTREE_INSERT_NOFAIL,
+                                 BCH_TRANS_COMMIT_lazy_rw|
+                                 BCH_TRANS_COMMIT_no_enospc,
                        __reattach_inode(trans, inode, inode_snapshot));
        bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
        return ret;
@@ -757,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans,
                                       BCH_HASH_SET_MUST_CREATE,
                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW);
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_lazy_rw);
 }
 
 static int hash_check_key(struct btree_trans *trans,
@@ -992,7 +992,7 @@ int bch2_check_inodes(struct bch_fs *c)
        ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
                        POS_MIN,
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_inode(trans, &iter, k, &prev, &s, full));
 
        snapshots_seen_exit(&s);
@@ -1226,7 +1226,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
                                k1, k2) ?:
                        bch2_trans_commit(trans, &res, NULL,
-                               BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
                bch2_disk_reservation_put(c, &res);
 
                if (ret)
@@ -1465,7 +1465,7 @@ int bch2_check_extents(struct bch_fs *c)
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                        &res, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
                bch2_disk_reservation_put(c, &res);
                check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
                check_extent_overbig(trans, &iter, k);
@@ -1494,7 +1494,7 @@ int bch2_check_indirect_extents(struct bch_fs *c)
                        POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        &res, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
                bch2_disk_reservation_put(c, &res);
                check_extent_overbig(trans, &iter, k);
        }));
@@ -1854,7 +1854,7 @@ int bch2_check_dirents(struct bch_fs *c)
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
 
        bch2_trans_put(trans);
@@ -1918,7 +1918,7 @@ int bch2_check_xattrs(struct bch_fs *c)
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_xattr(trans, &iter, k, &hash_info, &inode)));
        bch_err_fn(c, ret);
        return ret;
@@ -1949,8 +1949,8 @@ static int check_root_trans(struct btree_trans *trans)
                root_subvol.v.snapshot  = cpu_to_le32(snapshot);
                root_subvol.v.inode     = cpu_to_le64(inum);
                ret = commit_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
+                                     BCH_TRANS_COMMIT_no_enospc|
+                                     BCH_TRANS_COMMIT_lazy_rw,
                        bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
                                            &root_subvol.k_i, 0));
                bch_err_msg(c, ret, "writing root subvol");
@@ -1986,8 +1986,8 @@ int bch2_check_root(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_do(c, NULL, NULL,
-                            BTREE_INSERT_NOFAIL|
-                            BTREE_INSERT_LAZY_RW,
+                            BCH_TRANS_COMMIT_no_enospc|
+                            BCH_TRANS_COMMIT_lazy_rw,
                check_root_trans(trans));
        bch_err_fn(c, ret);
        return ret;
@@ -2116,8 +2116,8 @@ static int check_path(struct btree_trans *trans,
                                return 0;
 
                        ret = commit_do(trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL|
-                                             BTREE_INSERT_LAZY_RW,
+                                             BCH_TRANS_COMMIT_no_enospc|
+                                             BCH_TRANS_COMMIT_lazy_rw,
                                        remove_backpointer(trans, inode));
                        if (ret) {
                                bch_err(c, "error removing dirent: %i", ret);
@@ -2398,7 +2398,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
                                POS(0, range_start),
                                BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                               NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                        check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
        if (ret < 0) {
                bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@@ -2483,7 +2483,7 @@ int bch2_fix_reflink_p(struct bch_fs *c)
                                BTREE_ID_extents, POS_MIN,
                                BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
                                BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
                        fix_reflink_p_key(trans, &iter, k)));
        bch_err_fn(c, ret);
        return ret;
index ef58e2927aecb58f1263f2e9ec83902bde6b7d99..1baf8b7fdccb9e77f61a8784dce4657f85cd58f2 100644 (file)
@@ -830,7 +830,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
 
                ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
                      bch2_trans_commit(trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
+                                       BCH_TRANS_COMMIT_no_enospc);
 err:
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        break;
@@ -893,7 +893,7 @@ retry:
 
        ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_no_enospc);
 err:
        bch2_trans_iter_exit(trans, &iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1057,7 +1057,7 @@ retry:
 
        ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_no_enospc);
 err:
        bch2_trans_iter_exit(trans, &iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1091,7 +1091,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 
        ret = bch2_inode_unpack(k, &inode);
        if (ret)
-               goto err;
+               goto out;
 
        if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
                        deleted_inode_is_dir,
@@ -1109,38 +1109,45 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
            !fsck_err(c,
                      deleted_inode_but_clean,
                      "filesystem marked as clean but have deleted inode %llu:%u",
-                     pos.offset, pos.snapshot))
-               return 0;
+                     pos.offset, pos.snapshot)) {
+               ret = 0;
+               goto out;
+       }
 
        if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
                struct bpos new_min_pos;
 
                ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
                if (ret)
-                       goto err;
+                       goto out;
 
                inode.bi_flags &= ~BCH_INODE_unlinked;
 
                ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
-                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+                       bch2_trans_commit(trans, NULL, NULL,
+                                         BCH_TRANS_COMMIT_no_enospc|
+                                         BCH_TRANS_COMMIT_lazy_rw);
                bch_err_msg(c, ret, "clearing inode unlinked flag");
                if (ret)
-                       return ret;
+                       goto out;
 
                /*
                 * We'll need another write buffer flush to pick up the new
                 * unlinked inodes in the snapshot leaves:
                 */
                *need_another_pass = true;
-               return 0;
+               goto out;
        }
 
-       return 1;
-err:
+       ret = 1;
+out:
 fsck_err:
+       bch2_trans_iter_exit(trans, &inode_iter);
        return ret;
 delete:
-       return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+       ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+       goto out;
 }
 
 int bch2_delete_dead_inodes(struct bch_fs *c)
index bebc11444ef5ec598ef83c475716ea789b33bf69..eab0c8c577856aca13b002646e15574272792d80 100644 (file)
@@ -256,7 +256,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
        u64 new_i_size = le64_to_cpu(op->v.new_i_size);
        int ret;
 
-       ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+       ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                        truncate_set_isize(trans, inum, new_i_size));
        if (ret)
                goto err;
@@ -378,7 +378,7 @@ case LOGGED_OP_FINSERT_start:
        op->v.state = LOGGED_OP_FINSERT_shift_extents;
 
        if (insert) {
-               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                adjust_i_size(trans, inum, src_offset, len) ?:
                                bch2_logged_op_update(trans, &op->k_i));
                if (ret)
@@ -390,7 +390,7 @@ case LOGGED_OP_FINSERT_start:
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto err;
 
-               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                bch2_logged_op_update(trans, &op->k_i));
        }
 
@@ -455,7 +455,7 @@ case LOGGED_OP_FINSERT_shift_extents:
                        bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
                        bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
                        bch2_logged_op_update(trans, &op->k_i) ?:
-                       bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+                       bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
 btree_err:
                bch2_disk_reservation_put(c, &disk_res);
 
@@ -470,12 +470,12 @@ btree_err:
        op->v.state = LOGGED_OP_FINSERT_finish;
 
        if (!insert) {
-               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                adjust_i_size(trans, inum, src_offset, shift) ?:
                                bch2_logged_op_update(trans, &op->k_i));
        } else {
                /* We need an inode update to update bi_journal_seq for fsync: */
-               ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                adjust_i_size(trans, inum, 0, 0) ?:
                                bch2_logged_op_update(trans, &op->k_i));
        }
index ae36fc485f5f1e018cddb728ed244d84a1459d05..b833409c7865b766c79ffeaa4d7b10910be5095e 100644 (file)
@@ -526,7 +526,7 @@ out:
 
 static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
 {
-       bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+       bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                      __bch2_rbio_narrow_crcs(trans, rbio));
 }
 
@@ -1025,7 +1025,7 @@ get_bio:
                trans->notrace_relock_fail = true;
        } else {
                /* Attempting reconstruct read: */
-               if (bch2_ec_read_extent(c, rbio)) {
+               if (bch2_ec_read_extent(trans, rbio)) {
                        bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
                        goto out;
                }
index fbfc42ff08036a17efec5fe573fede3335cce24d..97f7a4b7fdecaaf048839d7e989ae84bc93edf52 100644 (file)
@@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_i *k;
        struct bkey_i_inode_v3 *inode;
+       /*
+        * Crazy performance optimization:
+        * Every extent update needs to also update the inode: the inode trigger
+        * will set bi->journal_seq to the journal sequence number of this
+        * transaction - for fsync.
+        *
+        * But if that's the only reason we're updating the inode (we're not
+        * updating bi_size or bi_sectors), then we don't need the inode update
+        * to be journalled - if we crash, the bi_journal_seq update will be
+        * lost, but that's fine.
+        */
        unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
        int ret;
 
@@ -305,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
                                                  i_sectors_delta) ?:
                bch2_trans_update(trans, iter, k, 0) ?:
                bch2_trans_commit(trans, disk_res, NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc);
        if (unlikely(ret))
                return ret;
 
@@ -1165,7 +1176,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
                ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
                                     bkey_start_pos(&orig->k), orig->k.p,
                                     BTREE_ITER_INTENT, k,
-                                    NULL, NULL, BTREE_INSERT_NOFAIL, ({
+                                    NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
                        bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
                }));
 
index 5b5d69f2316b216746c0c08db2346c2c8c95ff16..7d448136434bd8f2b63674298387e84c214753bb 100644 (file)
@@ -361,11 +361,6 @@ static int journal_entry_open(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       if (j->res_get_blocked_start)
-               bch2_time_stats_update(j->blocked_time,
-                                      j->res_get_blocked_start);
-       j->res_get_blocked_start = 0;
-
        mod_delayed_work(c->io_complete_wq,
                         &j->write_work,
                         msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -465,15 +460,12 @@ retry:
        __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
        ret = journal_entry_open(j);
 
-       if (ret == JOURNAL_ERR_max_in_flight)
+       if (ret == JOURNAL_ERR_max_in_flight) {
+               track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+                                  &j->max_in_flight_start, true);
                trace_and_count(c, journal_entry_full, c);
-unlock:
-       if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
-           !j->res_get_blocked_start) {
-               j->res_get_blocked_start = local_clock() ?: 1;
-               trace_and_count(c, journal_full, c);
        }
-
+unlock:
        can_discard = j->can_discard;
        spin_unlock(&j->lock);
 
@@ -526,36 +518,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
        return ret;
 }
 
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
-                                    struct journal_preres *res,
-                                    unsigned new_u64s,
-                                    unsigned flags)
-{
-       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
-       if (!ret && mutex_trylock(&j->reclaim_lock)) {
-               bch2_journal_reclaim(j);
-               mutex_unlock(&j->reclaim_lock);
-       }
-
-       return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
-                             struct journal_preres *res,
-                             unsigned new_u64s,
-                             unsigned flags)
-{
-       int ret;
-
-       closure_wait_event(&j->preres_wait,
-                  (ret = bch2_journal_error(j)) ||
-                  journal_preres_available(j, res, new_u64s, flags));
-       return ret;
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *j,
@@ -1290,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        union journal_res_state s;
        struct bch_dev *ca;
        unsigned long now = jiffies;
+       u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
        u64 seq;
        unsigned i;
 
@@ -1303,21 +1266,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        prt_printf(out, "dirty journal entries:\t%llu/%llu\n",  fifo_used(&j->pin), j->pin.size);
        prt_printf(out, "seq:\t\t\t%llu\n",                     journal_cur_seq(j));
        prt_printf(out, "seq_ondisk:\t\t%llu\n",                j->seq_ondisk);
-       prt_printf(out, "last_seq:\t\t%llu\n",          journal_last_seq(j));
+       prt_printf(out, "last_seq:\t\t%llu\n",                  journal_last_seq(j));
        prt_printf(out, "last_seq_ondisk:\t%llu\n",             j->last_seq_ondisk);
-       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",  j->flushed_seq_ondisk);
-       prt_printf(out, "prereserved:\t\t%u/%u\n",              j->prereserved.reserved, j->prereserved.remaining);
-       prt_printf(out, "watermark:\t\t%s\n",           bch2_watermarks[j->watermark]);
-       prt_printf(out, "each entry reserved:\t%u\n",   j->entry_u64s_reserved);
+       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",          j->flushed_seq_ondisk);
+       prt_printf(out, "watermark:\t\t%s\n",                   bch2_watermarks[j->watermark]);
+       prt_printf(out, "each entry reserved:\t%u\n",           j->entry_u64s_reserved);
        prt_printf(out, "nr flush writes:\t%llu\n",             j->nr_flush_writes);
-       prt_printf(out, "nr noflush writes:\t%llu\n",   j->nr_noflush_writes);
-       prt_printf(out, "nr direct reclaim:\t%llu\n",   j->nr_direct_reclaim);
+       prt_printf(out, "nr noflush writes:\t%llu\n",           j->nr_noflush_writes);
+       prt_printf(out, "average write size:\t");
+       prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+       prt_newline(out);
+       prt_printf(out, "nr direct reclaim:\t%llu\n",           j->nr_direct_reclaim);
        prt_printf(out, "nr background reclaim:\t%llu\n",       j->nr_background_reclaim);
        prt_printf(out, "reclaim kicked:\t\t%u\n",              j->reclaim_kicked);
-       prt_printf(out, "reclaim runs in:\t%u ms\n",    time_after(j->next_reclaim, now)
+       prt_printf(out, "reclaim runs in:\t%u ms\n",            time_after(j->next_reclaim, now)
               ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
-       prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
-       prt_printf(out, "current entry error:\t%s\n",   bch2_journal_errors[j->cur_entry_error]);
+       prt_printf(out, "current entry sectors:\t%u\n",         j->cur_entry_sectors);
+       prt_printf(out, "current entry error:\t%s\n",           bch2_journal_errors[j->cur_entry_error]);
        prt_printf(out, "current entry:\t\t");
 
        switch (s.cur_entry_offset) {
index 011711e99c8d825ec968cf513f82c08a66ecabc5..c85d01cf49484984d08d20a2159f84b2506f96a1 100644 (file)
@@ -395,104 +395,6 @@ out:
        return 0;
 }
 
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
-       union journal_preres_state s = READ_ONCE(j->prereserved);
-       unsigned watermark = BCH_WATERMARK_stripe;
-
-       if (fifo_free(&j->pin) < j->pin.size / 4)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (fifo_free(&j->pin) < j->pin.size / 8)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (s.reserved > s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (!s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (watermark == j->watermark)
-               return;
-
-       swap(watermark, j->watermark);
-       if (watermark > j->watermark)
-               journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
-                                          struct journal_preres *res)
-{
-       union journal_preres_state s = { .reserved = res->u64s };
-
-       if (!res->u64s)
-               return;
-
-       s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
-       res->u64s = 0;
-
-       if (unlikely(s.waiting)) {
-               clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
-                         (unsigned long *) &j->prereserved.v);
-               closure_wake_up(&j->preres_wait);
-       }
-
-       if (s.reserved <= s.remaining && j->watermark)
-               journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
-                       struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
-                                              struct journal_preres *res,
-                                              unsigned new_u64s,
-                                              unsigned flags,
-                                              bool set_waiting)
-{
-       int d = new_u64s - res->u64s;
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       int ret;
-
-       do {
-               old.v = new.v = v;
-               ret = 0;
-
-               if (watermark == BCH_WATERMARK_reclaim ||
-                   new.reserved + d < new.remaining) {
-                       new.reserved += d;
-                       ret = 1;
-               } else if (set_waiting && !new.waiting)
-                       new.waiting = true;
-               else
-                       return 0;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
-
-       if (ret)
-               res->u64s += d;
-       return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
-                                         struct journal_preres *res,
-                                         unsigned new_u64s,
-                                         unsigned flags)
-{
-       if (new_u64s <= res->u64s)
-               return 0;
-
-       if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
-               return 0;
-
-       if (flags & JOURNAL_RES_GET_NONBLOCK)
-               return -BCH_ERR_journal_preres_get_blocked;
-
-       return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *,
index 65878542940d924e9d6656ff5590a14fe8a70b14..109c1157eba1d0c18aa510b94ac134356324e8af 100644 (file)
@@ -1079,6 +1079,12 @@ found:
 
        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+               /*
+                * Debug code for ZNS support, where we (probably) want to be
+                * correlated where we stopped in the journal to the zone write
+                * points:
+                */
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
@@ -1086,6 +1092,7 @@ found:
 
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
+#endif
                ja->sectors_free = 0;
        }
 
@@ -1585,6 +1592,9 @@ static void journal_write_done(struct closure *cl)
 
        bch2_journal_space_available(j);
 
+       track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+                          &j->max_in_flight_start, false);
+
        closure_wake_up(&w->wait);
        journal_wake(j);
 
@@ -1678,9 +1688,15 @@ static void do_journal_write(struct closure *cl)
        continue_at(cl, journal_write_done, c->io_complete_wq);
 }
 
-static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 {
-       struct jset_entry *i, *next, *prev = NULL;
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct jset_entry *start, *end, *i, *next, *prev = NULL;
+       struct jset *jset = w->data;
+       unsigned sectors, bytes, u64s;
+       bool validate_before_checksum = false;
+       unsigned long btree_roots_have = 0;
+       int ret;
 
        /*
         * Simple compaction, dropping empty jset_entries (from journal
@@ -1697,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
                if (!u64s)
                        continue;
 
-               if (i->type == BCH_JSET_ENTRY_btree_root)
+               /*
+                * New btree roots are set by journalling them; when the journal
+                * entry gets written we have to propagate them to
+                * c->btree_roots
+                *
+                * But, every journal entry we write has to contain all the
+                * btree roots (at least for now); so after we copy btree roots
+                * to c->btree_roots we have to get any missing btree roots and
+                * add them to this journal entry:
+                */
+               if (i->type == BCH_JSET_ENTRY_btree_root) {
                        bch2_journal_entry_to_btree_root(c, i);
+                       __set_bit(i->btree_id, &btree_roots_have);
+               }
 
                /* Can we merge with previous entry? */
                if (prev &&
@@ -1722,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
 
        prev = prev ? vstruct_next(prev) : jset->start;
        jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-void bch2_journal_write(struct closure *cl)
-{
-       struct journal *j = container_of(cl, struct journal, io);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       struct journal_buf *w = journal_last_unwritten_buf(j);
-       struct bch_replicas_padded replicas;
-       struct jset_entry *start, *end;
-       struct jset *jset;
-       struct bio *bio;
-       struct printbuf journal_debug_buf = PRINTBUF;
-       bool validate_before_checksum = false;
-       unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
-       int ret;
-
-       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
-       journal_buf_realloc(j, w);
-       jset = w->data;
-
-       j->write_start_time = local_clock();
-
-       spin_lock(&j->lock);
-
-       /*
-        * If the journal is in an error state - we did an emergency shutdown -
-        * we prefer to continue doing journal writes. We just mark them as
-        * noflush so they'll never be used, but they'll still be visible by the
-        * list_journal tool - this helps in debugging.
-        *
-        * There's a caveat: the first journal write after marking the
-        * superblock dirty must always be a flush write, because on startup
-        * from a clean shutdown we didn't necessarily read the journal and the
-        * new journal write might overwrite whatever was in the journal
-        * previously - we can't leave the journal without any flush writes in
-        * it.
-        *
-        * So if we're in an error state, and we're still starting up, we don't
-        * write anything at all.
-        */
-       if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
-           (bch2_journal_error(j) ||
-            w->noflush ||
-            (!w->must_flush &&
-             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
-             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
-               w->noflush = true;
-               SET_JSET_NO_FLUSH(jset, true);
-               jset->last_seq  = 0;
-               w->last_seq     = 0;
-
-               j->nr_noflush_writes++;
-       } else if (!bch2_journal_error(j)) {
-               j->last_flush_write = jiffies;
-               j->nr_flush_writes++;
-               clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
-       } else {
-               spin_unlock(&j->lock);
-               goto err;
-       }
-       spin_unlock(&j->lock);
-
-       /*
-        * New btree roots are set by journalling them; when the journal entry
-        * gets written we have to propagate them to c->btree_roots
-        *
-        * But, every journal entry we write has to contain all the btree roots
-        * (at least for now); so after we copy btree roots to c->btree_roots we
-        * have to get any missing btree roots and add them to this journal
-        * entry:
-        */
-
-       bch2_journal_entries_postprocess(c, jset);
 
        start = end = vstruct_last(jset);
 
-       end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+       end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
        bch2_journal_super_entries_add_common(c, &end,
                                le64_to_cpu(jset->seq));
@@ -1816,7 +1769,7 @@ void bch2_journal_write(struct closure *cl)
                bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
                                    vstruct_bytes(jset), w->sectors << 9,
                                    u64s, w->u64s_reserved, j->entry_u64s_reserved);
-               goto err;
+               return -EINVAL;
        }
 
        jset->magic             = cpu_to_le64(jset_magic(c));
@@ -1835,37 +1788,119 @@ void bch2_journal_write(struct closure *cl)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
-           jset_validate(c, NULL, jset, 0, WRITE))
-               goto err;
+           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+               return ret;
 
        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
        if (bch2_fs_fatal_err_on(ret, c,
                        "error decrypting journal entry: %i", ret))
-               goto err;
+               return ret;
 
        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);
 
        if (!validate_before_checksum &&
-           jset_validate(c, NULL, jset, 0, WRITE))
-               goto err;
+           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+               return ret;
 
        memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+       return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       int error = bch2_journal_error(j);
+
+       /*
+        * If the journal is in an error state - we did an emergency shutdown -
+        * we prefer to continue doing journal writes. We just mark them as
+        * noflush so they'll never be used, but they'll still be visible by the
+        * list_journal tool - this helps in debugging.
+        *
+        * There's a caveat: the first journal write after marking the
+        * superblock dirty must always be a flush write, because on startup
+        * from a clean shutdown we didn't necessarily read the journal and the
+        * new journal write might overwrite whatever was in the journal
+        * previously - we can't leave the journal without any flush writes in
+        * it.
+        *
+        * So if we're in an error state, and we're still starting up, we don't
+        * write anything at all.
+        */
+       if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+               return -EIO;
+
+       if (error ||
+           w->noflush ||
+           (!w->must_flush &&
+            (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+                    w->noflush = true;
+               SET_JSET_NO_FLUSH(w->data, true);
+               w->data->last_seq       = 0;
+               w->last_seq             = 0;
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+               clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+       }
+
+       return 0;
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct journal_buf *w = journal_last_unwritten_buf(j);
+       struct bch_replicas_padded replicas;
+       struct bio *bio;
+       struct printbuf journal_debug_buf = PRINTBUF;
+       unsigned i, nr_rw_members = 0;
+       int ret;
+
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+       j->write_start_time = local_clock();
 
-retry_alloc:
        spin_lock(&j->lock);
-       ret = journal_write_alloc(j, w);
+       ret = bch2_journal_write_pick_flush(j, w);
+       spin_unlock(&j->lock);
+       if (ret)
+               goto err;
+
+       journal_buf_realloc(j, w);
+
+       ret = bch2_journal_write_prep(j, w);
+       if (ret)
+               goto err;
+
+       j->entry_bytes_written += vstruct_bytes(w->data);
+
+       while (1) {
+               spin_lock(&j->lock);
+               ret = journal_write_alloc(j, w);
+               if (!ret || !j->can_discard)
+                       break;
 
-       if (ret && j->can_discard) {
                spin_unlock(&j->lock);
                bch2_journal_do_discards(j);
-               goto retry_alloc;
        }
 
-       if (ret)
+       if (ret) {
                __bch2_journal_debug_to_text(&journal_debug_buf, j);
+               spin_unlock(&j->lock);
+               bch_err(c, "Unable to allocate journal write:\n%s",
+                       journal_debug_buf.buf);
+               printbuf_exit(&journal_debug_buf);
+               goto err;
+       }
 
        /*
         * write is allocated, no longer need to account for it in
@@ -1880,13 +1915,6 @@ retry_alloc:
        bch2_journal_space_available(j);
        spin_unlock(&j->lock);
 
-       if (ret) {
-               bch_err(c, "Unable to allocate journal write:\n%s",
-                       journal_debug_buf.buf);
-               printbuf_exit(&journal_debug_buf);
-               goto err;
-       }
-
        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 
        if (c->opts.nochanges)
@@ -1908,7 +1936,7 @@ retry_alloc:
        if (ret)
                goto err;
 
-       if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+       if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
                for_each_rw_member(ca, c, i) {
                        percpu_ref_get(&ca->io_ref);
 
index 9a584aaaa2eba9abadc7f2016a20c70834e0610c..8fa05bedb7dff8f36084fafc37106fe9915579a0 100644 (file)
@@ -50,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
        return available;
 }
 
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j)
 {
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
-
-       do {
-               old.v = new.v = v;
-               new.remaining = u64s_remaining;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool low_on_space = j->space[journal_space_clean].total * 4 <=
+               j->space[journal_space_total].total;
+       bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+       unsigned watermark = low_on_space || low_on_pin
+               ? BCH_WATERMARK_reclaim
+               : BCH_WATERMARK_stripe;
+
+       if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+                              &j->low_on_space_start, low_on_space) ||
+           track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+                              &j->low_on_pin_start, low_on_pin))
+               trace_and_count(c, journal_full, c);
+
+       swap(watermark, j->watermark);
+       if (watermark > j->watermark)
+               journal_wake(j);
 }
 
 static struct journal_space
@@ -162,7 +171,6 @@ void bch2_journal_space_available(struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned clean, clean_ondisk, total;
-       s64 u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +230,10 @@ void bch2_journal_space_available(struct journal *j)
        else
                clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
 
-       u64s_remaining  = (u64) clean << 6;
-       u64s_remaining -= (u64) total << 3;
-       u64s_remaining = max(0LL, u64s_remaining);
-       u64s_remaining /= 4;
-       u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+       journal_set_watermark(j);
 out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
-       journal_set_remaining(j, u64s_remaining);
-       journal_set_watermark(j);
 
        if (!ret)
                journal_wake(j);
@@ -369,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
                return JOURNAL_PIN_other;
 }
 
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
-                         journal_pin_flush_fn flush_fn)
+                         journal_pin_flush_fn flush_fn,
+                         enum journal_pin_type type)
+{
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+       /*
+        * flush_fn is how we identify journal pins in debugfs, so must always
+        * exist, even if it doesn't do anything:
+        */
+       BUG_ON(!flush_fn);
+
+       atomic_inc(&pin_list->count);
+       pin->seq        = seq;
+       pin->flush      = flush_fn;
+       list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+                          struct journal_entry_pin *dst,
+                          struct journal_entry_pin *src,
+                          journal_pin_flush_fn flush_fn)
 {
-       struct journal_entry_pin_list *pin_list;
        bool reclaim;
 
        spin_lock(&j->lock);
 
+       u64 seq = READ_ONCE(src->seq);
+
        if (seq < journal_last_seq(j)) {
                /*
                 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -389,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
                return;
        }
 
-       pin_list = journal_seq_pin(j, seq);
+       reclaim = __journal_pin_drop(j, dst);
 
-       reclaim = __journal_pin_drop(j, pin);
+       bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
 
-       atomic_inc(&pin_list->count);
-       pin->seq        = seq;
-       pin->flush      = flush_fn;
+       if (reclaim)
+               bch2_journal_reclaim_fast(j);
+       spin_unlock(&j->lock);
 
-       if (flush_fn)
-               list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
-       else
-               list_add(&pin->list, &pin_list->flushed);
+       /*
+        * If the journal is currently full,  we might want to call flush_fn
+        * immediately:
+        */
+       journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+                         struct journal_entry_pin *pin,
+                         journal_pin_flush_fn flush_fn)
+{
+       bool reclaim;
+
+       spin_lock(&j->lock);
+
+       BUG_ON(seq < journal_last_seq(j));
+
+       reclaim = __journal_pin_drop(j, pin);
+
+       bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
 
        if (reclaim)
                bch2_journal_reclaim_fast(j);
@@ -555,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j)
                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;
 
-               /* And include pre-reservations: */
-               nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
-                                          (ca->mi.bucket_size << 6) -
-                                          journal_entry_overhead(j));
-
                nr_buckets = min(nr_buckets, ja->nr);
 
                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                               msecs_to_jiffies(c->opts.journal_reclaim_delay)))
                        min_nr = 1;
 
-               if (j->prereserved.reserved * 4 > j->prereserved.remaining)
-                       min_nr = 1;
-
-               if (fifo_free(&j->pin) <= 32)
+               if (j->watermark != BCH_WATERMARK_stripe)
                        min_nr = 1;
 
                if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                trace_and_count(c, journal_reclaim_start, c,
                                direct, kicked,
                                min_nr, min_key_cache,
-                               j->prereserved.reserved,
-                               j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
                                c->btree_cache.used,
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
@@ -805,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 {
+       /* time_stats this */
        bool did_work = false;
 
        if (!test_bit(JOURNAL_STARTED, &j->flags))
index 494d1a6eddb011fd5c0aa0b41676522949b12577..7b15d682a0f51d28c47f7d881edb1b08ca24d10c 100644 (file)
@@ -47,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
                bch2_journal_pin_set(j, seq, pin, flush_fn);
 }
 
-static inline void bch2_journal_pin_copy(struct journal *j,
-                                        struct journal_entry_pin *dst,
-                                        struct journal_entry_pin *src,
-                                        journal_pin_flush_fn flush_fn)
-{
-       /* Guard against racing with journal_pin_drop(src): */
-       u64 seq = READ_ONCE(src->seq);
-
-       if (seq)
-               bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+                          struct journal_entry_pin *,
+                          struct journal_entry_pin *,
+                          journal_pin_flush_fn);
 
 static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
                                           struct journal_entry_pin *pin,
index 42504e16acb6ccf261a6699b6d468cba7d26a776..2427cce64fed93388214c3de8b6446875eaf01b6 100644 (file)
@@ -76,14 +76,6 @@ struct journal_res {
        u64                     seq;
 };
 
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
-       unsigned                u64s;
-};
-
 union journal_res_state {
        struct {
                atomic64_t      counter;
@@ -104,22 +96,6 @@ union journal_res_state {
        };
 };
 
-union journal_preres_state {
-       struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
-       struct {
-               u64             waiting:1,
-                               reserved:31,
-                               remaining:32;
-       };
-};
-
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
 #define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
        union journal_res_state reservations;
        enum bch_watermark      watermark;
 
-       union journal_preres_state prereserved;
-
        } __aligned(SMP_CACHE_BYTES);
 
        unsigned long           flags;
@@ -288,15 +262,18 @@ struct journal {
 
        unsigned long           last_flush_write;
 
-       u64                     res_get_blocked_start;
        u64                     write_start_time;
 
        u64                     nr_flush_writes;
        u64                     nr_noflush_writes;
+       u64                     entry_bytes_written;
+
+       u64                     low_on_space_start;
+       u64                     low_on_pin_start;
+       u64                     max_in_flight_start;
 
        struct bch2_time_stats  *flush_write_time;
        struct bch2_time_stats  *noflush_write_time;
-       struct bch2_time_stats  *blocked_time;
        struct bch2_time_stats  *flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
index 8640f7dee0de95d8a15439b587a7455c0171f9c4..9a76a9aab5c33b54b16bae1412172cd3ca557a5e 100644 (file)
@@ -85,13 +85,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
 
 int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
 {
-       return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+       return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                         __bch2_logged_op_start(trans, k));
 }
 
 void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
 {
-       int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                            bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
        /*
         * This needs to be a fatal error because we've left an unfinished
index a5cc0ed195d6324d1f49718d5860b24045579f1b..e6d081c0592c81bb1db26db6c3d08aafce8bc7d9 100644 (file)
@@ -155,7 +155,7 @@ int bch2_check_lrus(struct bch_fs *c)
        ret = bch2_trans_run(c,
                for_each_btree_key_commit(trans, iter,
                                BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
                        bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
        if (ret)
                bch_err_fn(c, ret);
index e3a51f6d6c9b25dcae89934eace9e68b038531de..8e5688d0a8ca6af79b9b98c11efdef88c67645e1 100644 (file)
@@ -90,7 +90,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
                ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                        bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
                if (ret)
                        break;
index ab749bf2fcbc551e68753857efdf008848d140b7..4f7d1758d8a97588a2e73a7397ea9880a356b414 100644 (file)
@@ -263,7 +263,7 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
 
        return bch2_trans_relock(trans) ?:
                bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
 int bch2_move_extent(struct moving_context *ctxt,
index 0158c7aae6b066a75f0d08ebdc5152cd6b9246fb..0a0576326c5b2d433fcd4aace513379972f57152 100644 (file)
@@ -370,6 +370,7 @@ static int bch2_copygc_thread(void *arg)
                        if (min_member_capacity == U64_MAX)
                                min_member_capacity = 128 * 2048;
 
+                       bch2_trans_unlock_long(ctxt.trans);
                        bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
                                        MAX_SCHEDULE_TIMEOUT);
                }
index 3319190b8d9c330fde44ad959bc299aa00d2ba87..db2139c0545d789c95297fd68ea9292a26a9cdaa 100644 (file)
@@ -69,7 +69,7 @@ err:
 
 int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
 {
-       int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+       int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
                            __bch2_set_rebalance_needs_scan(trans, inum));
        rebalance_wakeup(c);
        return ret;
@@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
 
        extent_entry_drop(bkey_i_to_s(n),
                          (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
-       return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
 static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -273,7 +273,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
        r->state = BCH_REBALANCE_scanning;
 
        ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
-               commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                          bch2_clear_rebalance_needs_scan(trans, inum, cookie));
 
        bch2_move_stats_exit(&r->scan_stats, trans->c);
index 9c30500ce9200af8be8f71a50f5fa02c356e4400..130274b195e21621c391ad89d965440cf077816c 100644 (file)
@@ -98,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
        unsigned update_flags = BTREE_TRIGGER_NORUN;
        int ret;
 
+       if (k->overwritten)
+               return 0;
+
+       trans->journal_res.seq = k->journal_seq;
+
        /*
         * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
         * keep the key cache coherent with the underlying btree. Nothing
@@ -139,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 static int bch2_journal_replay(struct bch_fs *c)
 {
        struct journal_keys *keys = &c->journal_keys;
-       struct journal_key **keys_sorted, *k;
+       DARRAY(struct journal_key *) keys_sorted = { 0 };
+       struct journal_key **kp;
        struct journal *j = &c->journal;
        u64 start_seq   = c->journal_replay_seq_start;
        u64 end_seq     = c->journal_replay_seq_start;
-       size_t i;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
 
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-       keys->gap = keys->nr;
-
-       keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
-       if (!keys_sorted)
-               return -BCH_ERR_ENOMEM_journal_replay;
-
-       for (i = 0; i < keys->nr; i++)
-               keys_sorted[i] = &keys->d[i];
-
-       sort(keys_sorted, keys->nr,
-            sizeof(keys_sorted[0]),
-            journal_sort_seq_cmp, NULL);
-
        if (keys->nr) {
                ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
                                           keys->nr, start_seq, end_seq);
@@ -167,27 +159,61 @@ static int bch2_journal_replay(struct bch_fs *c)
                        goto err;
        }
 
-       for (i = 0; i < keys->nr; i++) {
-               k = keys_sorted[i];
+       /*
+        * First, attempt to replay keys in sorted order. This is more
+        * efficient, but some might fail if that would cause a journal
+        * deadlock.
+        */
+       for (size_t i = 0; i < keys->nr; i++) {
+               cond_resched();
+
+               struct journal_key *k = keys->d + i;
+
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_journal_reclaim|
+                               (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+                            bch2_journal_replay_key(trans, k));
+               BUG_ON(!ret && !k->overwritten);
+               if (ret) {
+                       ret = darray_push(&keys_sorted, k);
+                       if (ret)
+                               goto err;
+               }
+       }
 
+       /*
+        * Now, replay any remaining keys in the order in which they appear in
+        * the journal, unpinning those journal entries as we go:
+        */
+       sort(keys_sorted.data, keys_sorted.nr,
+            sizeof(keys_sorted.data[0]),
+            journal_sort_seq_cmp, NULL);
+
+       darray_for_each(keys_sorted, kp) {
                cond_resched();
 
+               struct journal_key *k = *kp;
+
                replay_now_at(j, k->journal_seq);
 
-               ret = bch2_trans_do(c, NULL, NULL,
-                                   BTREE_INSERT_LAZY_RW|
-                                   BTREE_INSERT_NOFAIL|
-                                   (!k->allocated
-                                    ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
-                                    : 0),
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               (!k->allocated
+                                ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+                                : 0),
                             bch2_journal_replay_key(trans, k));
-               if (ret) {
-                       bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
-                               bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+               bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+                           bch2_btree_id_str(k->btree_id), k->level);
+               if (ret)
                        goto err;
-               }
+
+               BUG_ON(!k->overwritten);
        }
 
+       bch2_trans_put(trans);
+       trans = NULL;
+
        replay_now_at(j, j->replay_journal_seq_end);
        j->replay_journal_seq = 0;
 
@@ -198,10 +224,10 @@ static int bch2_journal_replay(struct bch_fs *c)
        if (keys->nr && !ret)
                bch2_journal_log_msg(c, "journal replay finished");
 err:
-       kvfree(keys_sorted);
-
-       if (ret)
-               bch_err_fn(c, ret);
+       if (trans)
+               bch2_trans_put(trans);
+       darray_exit(&keys_sorted);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -468,7 +494,7 @@ err:
 noinline_for_stack
 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
-       int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+       int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
                                __bch2_fs_upgrade_for_subvolumes(trans));
        if (ret)
                bch_err_fn(c, ret);
@@ -489,7 +515,19 @@ static int bch2_check_allocations(struct bch_fs *c)
 
 static int bch2_set_may_go_rw(struct bch_fs *c)
 {
+       struct journal_keys *keys = &c->journal_keys;
+
+       /*
+        * After we go RW, the journal keys buffer can't be modified (except for
+        * setting journal_key->overwritten: it will be accessed by multiple
+        * threads
+        */
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
        set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+       if (keys->nr)
+               return bch2_fs_read_write_early(c);
        return 0;
 }
 
index 6e1bfe9feb59e4abe96e1dc74b30196fa5766f48..07ddf3e85ee454577f4ef6354e25ab7a89671161 100644 (file)
@@ -390,7 +390,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        inode_u.bi_size = new_i_size;
                        ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
                                bch2_trans_commit(trans, NULL, NULL,
-                                                 BTREE_INSERT_NOFAIL);
+                                                 BCH_TRANS_COMMIT_no_enospc);
                }
 
                bch2_trans_iter_exit(trans, &inode_iter);
index 9b6cc86d264a1f9e2412a552a0547c6d3f4d4f28..e151ada1c8bd2db23e31bc1f6f027815585e8ab2 100644 (file)
@@ -376,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 
        entry = sb_clean->start;
        bch2_journal_super_entries_add_common(c, &entry, 0);
-       entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+       entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
        memset(entry, 0,
index 9215d414b5253c81aa9cd6ee48eb214afa354164..f0930ab7f036eb30fe5d40708f4b82a1e68907f2 100644 (file)
@@ -70,7 +70,7 @@ static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
                prt_tab(out);
                prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
                prt_tab(out);
-               bch2_prt_date_seconds(out, le64_to_cpu(e->entries[i].last_error_time));
+               bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
                prt_newline(out);
        }
 }
index 6a7e20de971c4bdedaec5d5d672c9dce38df32a2..bed0f857fe5b7627639ee24202dba1002910eee7 100644 (file)
@@ -230,7 +230,7 @@ static void member_to_text(struct printbuf *out,
        prt_printf(out, "Last mount:");
        prt_tab(out);
        if (m.last_mount)
-               bch2_prt_date_seconds(out, le64_to_cpu(m.last_mount));
+               bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
        else
                prt_printf(out, "(never)");
        prt_newline(out);
index e9af77b384c76c694194c53b348706e354df9a22..b23550b4409814baa9010d637cbed885df6a4482 100644 (file)
@@ -590,7 +590,7 @@ int bch2_check_snapshot_trees(struct bch_fs *c)
                for_each_btree_key_commit(trans, iter,
                        BTREE_ID_snapshot_trees, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_snapshot_tree(trans, &iter, k)));
 
        if (ret)
@@ -868,7 +868,7 @@ int bch2_check_snapshots(struct bch_fs *c)
                for_each_btree_key_reverse_commit(trans, iter,
                        BTREE_ID_snapshots, POS_MAX,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_snapshot(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
@@ -959,7 +959,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
                                        parent_id, id))
                        goto err;
 
-               parent->v.children[i] = le32_to_cpu(child_id);
+               parent->v.children[i] = cpu_to_le32(child_id);
 
                normalize_snapshot_child_pointers(&parent->v);
        }
@@ -1449,12 +1449,12 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                ret = for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               &res, NULL, BTREE_INSERT_NOFAIL,
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
                        snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
                      for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               &res, NULL, BTREE_INSERT_NOFAIL,
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
                        move_key_to_correct_snapshot(trans, &iter, k));
 
                bch2_disk_reservation_put(c, &res);
@@ -1489,7 +1489,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
         */
        ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
                                  BTREE_ITER_INTENT, k,
-                                 NULL, NULL, BTREE_INSERT_NOFAIL,
+                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
        if (ret)
                goto err_create_lock;
index ae21a8cca1b49d4d9bbfe2b38a330c78b9abc023..89fdb7c21134ebbb6c145a88ed5b1943ab54588a 100644 (file)
 #include <crypto/hash.h>
 #include <crypto/sha2.h>
 
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+       __BCH_HASH_SET_MUST_CREATE,
+       __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE       (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE      (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
 static inline enum bch_str_hash_type
 bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
 {
@@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
                           const struct bch_hash_info *info,
                           subvol_inum inum, u32 snapshot,
                           struct bkey_i *insert,
-                          int flags,
+                          bch_str_hash_flags_t str_hash_flags,
                           int update_flags)
 {
        struct btree_iter iter, slot = { NULL };
@@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
                }
 
                if (!slot.path &&
-                   !(flags & BCH_HASH_SET_MUST_REPLACE))
+                   !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
                        bch2_trans_copy_iter(&slot, &iter);
 
                if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +297,16 @@ found:
        found = true;
 not_found:
 
-       if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+       if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
                ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
-       } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+       } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
                ret = -EEXIST;
        } else {
                if (!found && slot.path)
                        swap(iter, slot);
 
                insert->k.p = iter.pos;
-               ret = bch2_trans_update(trans, &iter, insert, 0);
+               ret = bch2_trans_update(trans, &iter, insert, update_flags);
        }
 
        goto out;
@@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans,
                  const struct bch_hash_desc desc,
                  const struct bch_hash_info *info,
                  subvol_inum inum,
-                 struct bkey_i *insert, int flags)
+                 struct bkey_i *insert,
+                 bch_str_hash_flags_t str_hash_flags)
 {
        u32 snapshot;
        int ret;
@@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans,
        insert->k.p.inode = inum.inum;
 
        return bch2_hash_set_snapshot(trans, desc, info, inum,
-                                     snapshot, insert, flags, 0);
+                                     snapshot, insert, str_hash_flags, 0);
 }
 
 static __always_inline
index fccd25aa32426a4233882a9d97cc214cba9dc6f5..1cbf9e3a09ecf67bbcc59951ce4df45e48f87c1c 100644 (file)
@@ -89,7 +89,7 @@ int bch2_check_subvols(struct bch_fs *c)
        ret = bch2_trans_run(c,
                for_each_btree_key_commit(trans, iter,
                        BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_subvol(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
@@ -219,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
                                   BTREE_ITER_CACHED, &s)) ?:
                for_each_btree_key_commit(trans, iter,
                                BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                        bch2_subvolume_reparent(trans, &iter, k,
                                        subvolid_to_delete, le32_to_cpu(s.parent)));
 }
@@ -256,7 +256,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
        return bch2_subvolumes_reparent(trans, subvolid) ?:
-               commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                          __bch2_subvolume_delete(trans, subvolid));
 }
 
index 86833445af205643b81bd08b3b204005c7bee071..2d2e66a4e4681ee5ba6ba18666d135ab961a2cbf 100644 (file)
@@ -20,7 +20,7 @@ struct snapshot_t {
 };
 
 struct snapshot_table {
-       struct snapshot_t       s[0];
+       DECLARE_FLEX_ARRAY(struct snapshot_t, s);
 };
 
 typedef struct {
index 9b9f36af6bd31d47aa84404a6f85e3d323c0a1d9..f4cad903f4d69da7776825f50bf561a1980a02a0 100644 (file)
@@ -1183,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
        prt_printf(out, "Created:");
        prt_tab(out);
        if (sb->time_base_lo)
-               bch2_prt_date_seconds(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+               bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
        else
                prt_printf(out, "(not set)");
        prt_newline(out);
index 24672bb31cbe9c479964dffe1d1b979dd66013c7..bb9451082e872ca6752e085ca1c884563821a157 100644 (file)
@@ -641,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c)
        ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
            kobject_add(&c->internal, &c->kobj, "internal") ?:
            kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
            kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
            bch2_opts_create_sysfs_files(&c->opts_dir);
        if (ret) {
@@ -750,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        c->journal.flush_write_time     = &c->times[BCH_TIME_journal_flush_write];
        c->journal.noflush_write_time   = &c->times[BCH_TIME_journal_noflush_write];
-       c->journal.blocked_time         = &c->times[BCH_TIME_blocked_journal];
        c->journal.flush_seq_time       = &c->times[BCH_TIME_journal_flush_seq];
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
index 893304a1f06e6ea03df55020cf7be26f349d8cfe..7857671159b491235a0bcdfe19f9b34a316a0126 100644 (file)
@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
 TRACE_EVENT(journal_reclaim_start,
        TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
                 u64 min_nr, u64 min_key_cache,
-                u64 prereserved, u64 prereserved_total,
                 u64 btree_cache_dirty, u64 btree_cache_total,
                 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+       TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
                btree_cache_dirty, btree_cache_total,
                btree_key_cache_dirty, btree_key_cache_total),
 
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
                __field(bool,           kicked                  )
                __field(u64,            min_nr                  )
                __field(u64,            min_key_cache           )
-               __field(u64,            prereserved             )
-               __field(u64,            prereserved_total       )
                __field(u64,            btree_cache_dirty       )
                __field(u64,            btree_cache_total       )
                __field(u64,            btree_key_cache_dirty   )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->kicked                 = kicked;
                __entry->min_nr                 = min_nr;
                __entry->min_key_cache          = min_key_cache;
-               __entry->prereserved            = prereserved;
-               __entry->prereserved_total      = prereserved_total;
                __entry->btree_cache_dirty      = btree_cache_dirty;
                __entry->btree_cache_total      = btree_cache_total;
                __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->direct,
                  __entry->kicked,
                  __entry->min_nr,
                  __entry->min_key_cache,
-                 __entry->prereserved,
-                 __entry->prereserved_total,
                  __entry->btree_cache_dirty,
                  __entry->btree_cache_total,
                  __entry->btree_key_cache_dirty,
index 7ba5df4e828608cb8c4e00597bdb460d2b78517e..2ff9cdfb006c920172e7727e5f9c406ee218fd99 100644 (file)
@@ -315,6 +315,57 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
        return ret;
 }
 
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+       time_t t = sec;
+       char buf[64];
+       ctime_r(&t, buf);
+       prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+       char buf[64];
+       snprintf(buf, sizeof(buf), "%ptT", &sec);
+       prt_u64(out, sec);
+}
+#endif
+
+static const struct time_unit {
+       const char      *name;
+       u64             nsecs;
+} time_units[] = {
+       { "ns",         1                },
+       { "us",         NSEC_PER_USEC    },
+       { "ms",         NSEC_PER_MSEC    },
+       { "s",          NSEC_PER_SEC     },
+       { "m",          (u64) NSEC_PER_SEC * 60},
+       { "h",          (u64) NSEC_PER_SEC * 3600},
+       { "eon",        U64_MAX          },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+       const struct time_unit *u;
+
+       for (u = time_units;
+            u + 1 < time_units + ARRAY_SIZE(time_units) &&
+            ns >= u[1].nsecs << 1;
+            u++)
+               ;
+
+       return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+       const struct time_unit *u = pick_time_units(ns);
+
+       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
 /* time stats: */
 
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -359,6 +410,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
                mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
                stats->max_duration = max(stats->max_duration, duration);
                stats->min_duration = min(stats->min_duration, duration);
+               stats->total_duration += duration;
                bch2_quantiles_update(&stats->quantiles, duration);
        }
 
@@ -372,20 +424,24 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
        }
 }
 
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+                                          struct bch2_time_stat_buffer *b)
+{
+       for (struct bch2_time_stat_buffer_entry *i = b->entries;
+            i < b->entries + ARRAY_SIZE(b->entries);
+            i++)
+               bch2_time_stats_update_one(stats, i->start, i->end);
+       b->nr = 0;
+}
+
 static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
                                                  struct bch2_time_stat_buffer *b)
 {
-       struct bch2_time_stat_buffer_entry *i;
        unsigned long flags;
 
        spin_lock_irqsave(&stats->lock, flags);
-       for (i = b->entries;
-            i < b->entries + ARRAY_SIZE(b->entries);
-            i++)
-               bch2_time_stats_update_one(stats, i->start, i->end);
+       __bch2_time_stats_clear_buffer(stats, b);
        spin_unlock_irqrestore(&stats->lock, flags);
-
-       b->nr = 0;
 }
 
 void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
@@ -423,40 +479,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
                preempt_enable();
        }
 }
-#endif
-
-static const struct time_unit {
-       const char      *name;
-       u64             nsecs;
-} time_units[] = {
-       { "ns",         1                },
-       { "us",         NSEC_PER_USEC    },
-       { "ms",         NSEC_PER_MSEC    },
-       { "s",          NSEC_PER_SEC     },
-       { "m",          (u64) NSEC_PER_SEC * 60},
-       { "h",          (u64) NSEC_PER_SEC * 3600},
-       { "eon",        U64_MAX          },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
-       const struct time_unit *u;
-
-       for (u = time_units;
-            u + 1 < time_units + ARRAY_SIZE(time_units) &&
-            ns >= u[1].nsecs << 1;
-            u++)
-               ;
-
-       return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
-       const struct time_unit *u = pick_time_units(ns);
-
-       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
 
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
@@ -467,26 +489,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
        prt_printf(out, "%s", u->name);
 }
 
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
-{
-       time_t t = sec;
-       char buf[64];
-       ctime_r(&t, buf);
-       prt_str(out, buf);
-}
-#else
-void bch2_prt_date_seconds(struct printbuf *out, time64_t sec)
-{
-       char buf[64];
-       snprintf(buf, sizeof(buf), "%ptT", &sec);
-       prt_u64(out, sec);
-}
-#endif
-
-#define TABSTOP_SIZE 12
-
 static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
 {
        prt_str(out, name);
@@ -495,12 +497,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
        prt_newline(out);
 }
 
+#define TABSTOP_SIZE 12
+
 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
 {
        const struct time_unit *u;
        s64 f_mean = 0, d_mean = 0;
        u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
        int i;
+
+       if (stats->buffer) {
+               int cpu;
+
+               spin_lock_irq(&stats->lock);
+               for_each_possible_cpu(cpu)
+                       __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+               spin_unlock_irq(&stats->lock);
+       }
+
        /*
         * avoid divide by zero
         */
@@ -546,6 +560,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 
        pr_name_and_units(out, "min:", stats->min_duration);
        pr_name_and_units(out, "max:", stats->max_duration);
+       pr_name_and_units(out, "total:", stats->total_duration);
 
        prt_printf(out, "mean:");
        prt_tab(out);
@@ -603,6 +618,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
                last_q = q;
        }
 }
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
 
 void bch2_time_stats_exit(struct bch2_time_stats *stats)
 {
index 0595605e3180c4303b5f51598708e751f288fbaa..54e309d94b9bebeb43c0503b145b6ad38b0771c5 100644 (file)
@@ -244,7 +244,7 @@ do {                                                                        \
 #define prt_bitflags(...)              bch2_prt_bitflags(__VA_ARGS__)
 
 void bch2_pr_time_units(struct printbuf *, u64);
-void bch2_prt_date_seconds(struct printbuf *, time64_t);
+void bch2_prt_datetime(struct printbuf *, time64_t);
 
 #ifdef __KERNEL__
 static inline void uuid_unparse_lower(u8 *uuid, char *out)
@@ -372,8 +372,9 @@ struct bch2_time_stat_buffer {
 struct bch2_time_stats {
        spinlock_t      lock;
        /* all fields are in nanoseconds */
-       u64             max_duration;
        u64             min_duration;
+       u64             max_duration;
+       u64             total_duration;
        u64             max_freq;
        u64             min_freq;
        u64             last_event;
@@ -388,15 +389,39 @@ struct bch2_time_stats {
 
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
 
 static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
 {
        __bch2_time_stats_update(stats, start, local_clock());
 }
 
+static inline bool track_event_change(struct bch2_time_stats *stats,
+                                     u64 *start, bool v)
+{
+       if (v != !!*start) {
+               if (!v) {
+                       bch2_time_stats_update(stats, *start);
+                       *start = 0;
+               } else {
+                       *start = local_clock() ?: 1;
+                       return true;
+               }
+       }
+
+       return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+                                     u64 *start, bool v)
+{
+       bool ret = v && !*start;
+       *start = v;
+       return ret;
+}
+#endif
+
 void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
 
 void bch2_time_stats_exit(struct bch2_time_stats *);