git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 717b356d1d bcachefs: Convert journal validation to bkey_in...
author    Kent Overstreet <kent.overstreet@linux.dev>
          Sat, 5 Aug 2023 22:06:22 +0000 (18:06 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
          Sun, 6 Aug 2023 17:08:36 +0000 (13:08 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
52 files changed:
.bcachefs_revision
cmd_device.c
cmd_dump.c
cmd_kill_btree_node.c
libbcachefs/alloc_foreground.h
libbcachefs/bcachefs.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_journal_iter.c [new file with mode: 0644]
libbcachefs/btree_journal_iter.h [new file with mode: 0644]
libbcachefs/btree_trans_commit.c [moved from libbcachefs/btree_update_leaf.c with 55% similarity]
libbcachefs/btree_update.c [new file with mode: 0644]
libbcachefs/btree_update_interior.c
libbcachefs/buckets.h
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/disk_groups.c
libbcachefs/errcode.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io-buffered.c [new file with mode: 0644]
libbcachefs/fs-io-buffered.h [new file with mode: 0644]
libbcachefs/fs-io-direct.c [new file with mode: 0644]
libbcachefs/fs-io-direct.h [new file with mode: 0644]
libbcachefs/fs-io-pagecache.c [new file with mode: 0644]
libbcachefs/fs-io-pagecache.h [new file with mode: 0644]
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/movinggc.c
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/sb-clean.c [new file with mode: 0644]
libbcachefs/sb-clean.h [new file with mode: 0644]
libbcachefs/sb-members.c [new file with mode: 0644]
libbcachefs/sb-members.h [new file with mode: 0644]
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index b2d874146ec6b06a9bcd8ff0ae205403bdfa0b73..d78b2e28bd2ee558f6212688f8330a8dce72b6f5 100644
@@ -1 +1 @@
-5b8c4a1366df20bc043404cb882230ce86296590
+717b356d1dfdf178ac46e217c81bb710b7e77032
diff --git a/cmd_device.c b/cmd_device.c
index c59d37094761dccb5d83b9905d34cce451714c75..1914629bbc234fafafe35e11e4a6cd50b01dcb5d 100644
@@ -16,6 +16,7 @@
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/journal.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super-io.h"
 #include "cmds.h"
 #include "libbcachefs.h"
diff --git a/cmd_dump.c b/cmd_dump.c
index ab94707a84d6b3025566512183c1fbcf73be4779..e1f0a5aa6d198f7f70b2fce505610d63c6b6b17f 100644
@@ -13,6 +13,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/error.h"
 #include "libbcachefs/extents.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void dump_usage(void)
diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c
index a8915a1f7c2921f78de255a1a904a2d5508b1751..e9b8265d40e8016c76693ee19f8e398c84490a5c 100644
@@ -11,6 +11,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/error.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void kill_btree_node_usage(void)
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index fee195f7eabfce5b6ef4dea079a8c98b88c9eebb..7aaeec44c7466cfd8aebf4d7daa9c099ff751fd2 100644
@@ -5,7 +5,7 @@
 #include "bcachefs.h"
 #include "alloc_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
 
 #include <linux/hash.h>
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index e1f1e8e871a81259104a0e16c43fdba739995aa2..30b3d7b9f9dc14de467685e653a2624201a6cf5a 100644
@@ -294,8 +294,8 @@ do {                                                                        \
 
 #define bch_err_fn(_c, _ret)                                           \
         bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret))
-#define bch_err_msg(_c, _ret, _msg)                                    \
-        bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret))
+#define bch_err_msg(_c, _ret, _msg, ...)                               \
+        bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret))
 
 #define bch_verbose(c, fmt, ...)                                       \
 do {                                                                   \
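
Note on the reworked bch_err_msg(): it is now variadic, so format arguments are interpolated into the message ahead of the error string. A minimal sketch of a call site, with a hypothetical caller (only the macro itself comes from this commit):

static int check_inode(struct bch_fs *c, u64 inum)	/* illustrative caller */
{
	int ret = -BCH_ERR_ENOMEM_journal_key_insert;	/* illustrative error */

	bch_err_msg(c, ret, "checking inode %llu", inum);
	/*
	 * expands to:
	 * bch_err(c, "%s(): error checking inode %llu %s",
	 *         __func__, inum, bch2_err_str(ret));
	 */
	return ret;
}
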
@@ -995,6 +995,7 @@ struct bch_fs {
        enum bch_recovery_pass  curr_recovery_pass;
        /* bitmap of explicitly enabled recovery passes: */
        u64                     recovery_passes_explicit;
+       u64                     recovery_passes_complete;
 
        /* DEBUG JUNK */
        struct dentry           *fs_debug_dir;
@@ -1139,22 +1140,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
        return dev < c->sb.nr_devices && c->devs[dev];
 }
 
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
-                                                 enum bch_recovery_pass pass)
-{
-       c->recovery_passes_explicit |= BIT_ULL(pass);
-
-       if (c->curr_recovery_pass >= pass) {
-               c->curr_recovery_pass = pass;
-               return -BCH_ERR_restart_recovery;
-       } else {
-               return 0;
-       }
-}
-
 #define BKEY_PADDED_ONSTACK(key, pad)                          \
        struct { struct bkey_i key; __u64 key ## _pad[pad]; }
 
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index ee7ba700e75f4ee3afbac3ab241c7d6d9b1278c3..d6960e259c804897137220f3f26ad734a087b35f 100644
@@ -7,14 +7,6 @@
 #include "bset.h"
 #include "util.h"
 
-#undef EBUG_ON
-
-#ifdef DEBUG_BKEYS
-#define EBUG_ON(cond)          BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
 const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
 
 void bch2_bkey_packed_to_binary_text(struct printbuf *out,
@@ -184,6 +176,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field)
        return v + offset;
 }
 
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+       unsigned bits = state->format->bits_per_field[field];
+
+       if (bits) {
+               if (bits > state->bits) {
+                       bits -= state->bits;
+                       /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+                       state->w |= (v >> 1) >> (bits - 1);
+
+                       *state->p = state->w;
+                       state->p = next_word(state->p);
+                       state->w = 0;
+                       state->bits = 64;
+               }
+
+               state->bits -= bits;
+               state->w |= v << state->bits;
+       }
+}
+
 __always_inline
 static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
 {
@@ -198,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
        if (fls64(v) > bits)
                return false;
 
-       if (bits > state->bits) {
-               bits -= state->bits;
-               /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
-               state->w |= (v >> 1) >> (bits - 1);
-
-               *state->p = state->w;
-               state->p = next_word(state->p);
-               state->w = 0;
-               state->bits = 64;
-       }
-
-       state->bits -= bits;
-       state->w |= v << state->bits;
-
+       __set_inc_field(state, field, v);
        return true;
 }
 
@@ -380,19 +381,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
                ret = false;
        }
 
-       if (bits > state->bits) {
-               bits -= state->bits;
-               state->w |= (v >> 1) >> (bits - 1);
-
-               *state->p = state->w;
-               state->p = next_word(state->p);
-               state->w = 0;
-               state->bits = 64;
-       }
-
-       state->bits -= bits;
-       state->w |= v << state->bits;
-
+       __set_inc_field(state, field, v);
        return ret;
 }
 
@@ -435,6 +424,24 @@ static bool bkey_packed_successor(struct bkey_packed *out,
 
        return false;
 }
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+       for (unsigned i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+               u64 packed_max = f->bits_per_field[i]
+                       ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+                       : 0;
+               u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+               if (packed_max + field_offset < packed_max ||
+                   packed_max + field_offset > unpacked_max)
+                       return true;
+       }
+
+       return false;
+}
 #endif
 
 /*
@@ -515,7 +522,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
 
                BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
                BUG_ON(bkey_packed_successor(&successor, b, *out) &&
-                      bkey_cmp_left_packed(b, &successor, &orig) < 0);
+                      bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+                      !bkey_format_has_too_big_fields(f));
        }
 #endif
 
@@ -604,40 +612,74 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
                }
        }
 
-       EBUG_ON(bch2_bkey_format_validate(&ret));
+#ifdef CONFIG_BCACHEFS_DEBUG
+       {
+               struct printbuf buf = PRINTBUF;
+
+               BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+               printbuf_exit(&buf);
+       }
+#endif
        return ret;
 }
 
-const char *bch2_bkey_format_validate(struct bkey_format *f)
+int bch2_bkey_format_invalid(struct bch_fs *c,
+                            struct bkey_format *f,
+                            enum bkey_invalid_flags flags,
+                            struct printbuf *err)
 {
        unsigned i, bits = KEY_PACKED_BITS_START;
 
-       if (f->nr_fields != BKEY_NR_FIELDS)
-               return "incorrect number of fields";
+       if (f->nr_fields != BKEY_NR_FIELDS) {
+               prt_printf(err, "incorrect number of fields: got %u, should be %u",
+                          f->nr_fields, BKEY_NR_FIELDS);
+               return -BCH_ERR_invalid;
+       }
 
        /*
         * Verify that the packed format can't represent fields larger than the
         * unpacked format:
         */
        for (i = 0; i < f->nr_fields; i++) {
-               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-               u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-               u64 packed_max = f->bits_per_field[i]
-                       ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-                       : 0;
-               u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-               if (packed_max + field_offset < packed_max ||
-                   packed_max + field_offset > unpacked_max)
-                       return "field too large";
+               if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+                       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+                       u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+                       u64 packed_max = f->bits_per_field[i]
+                               ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+                               : 0;
+                       u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+                       if (packed_max + field_offset < packed_max ||
+                           packed_max + field_offset > unpacked_max) {
+                               prt_printf(err, "field %u too large: %llu + %llu > %llu",
+                                          i, packed_max, field_offset, unpacked_max);
+                               return -BCH_ERR_invalid;
+                       }
+               }
 
                bits += f->bits_per_field[i];
        }
 
-       if (f->key_u64s != DIV_ROUND_UP(bits, 64))
-               return "incorrect key_u64s";
+       if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+               prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+                          f->key_u64s, DIV_ROUND_UP(bits, 64));
+               return -BCH_ERR_invalid;
+       }
+
+       return 0;
+}
 
-       return NULL;
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+       prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+       for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+               if (i)
+                       prt_str(out, ", ");
+               prt_printf(out, "%u:%llu",
+                          f->bits_per_field[i],
+                          le64_to_cpu(f->field_offset[i]));
+       }
 }
 
 /*
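
Both the new bkey_format_has_too_big_fields() and bch2_bkey_format_invalid() compute the largest value representable in a given number of bits with the expression ~((~0ULL << 1) << (bits - 1)); splitting the shift in two keeps every shift count below 64. A small sketch, using an illustrative helper name that is not in the commit:

/* max value representable in "bits" bits (bits must be >= 1) */
static inline u64 max_in_bits(unsigned bits)
{
	/*
	 * Conceptually ~(~0ULL << bits), but written so that no single shift
	 * count can reach 64 when bits == 64:
	 */
	return ~((~0ULL << 1) << (bits - 1));
}

/* e.g. max_in_bits(5) == 0x1f, max_in_bits(64) == ~0ULL */
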
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index e81fb3e00c602dfca2e544ac85de7b4584c5b92d..51969a46265e124233028dfbf6b3e39137cb383a 100644
@@ -9,6 +9,12 @@
 #include "util.h"
 #include "vstructs.h"
 
+enum bkey_invalid_flags {
+       BKEY_INVALID_WRITE              = (1U << 0),
+       BKEY_INVALID_COMMIT             = (1U << 1),
+       BKEY_INVALID_JOURNAL            = (1U << 2),
+};
+
 #if 0
 
 /*
@@ -769,6 +775,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
 
 void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
 struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+                            enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
 
 #endif /* _BCACHEFS_BKEY_H */
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index d7b63769068c7464e709d1724d4a53c39c3bc4f4..668f595e2fcfeadf29547bd33d46448bad6dc7e3 100644
@@ -13,12 +13,6 @@ enum btree_node_type;
 extern const char * const bch2_bkey_types[];
 extern const struct bkey_ops bch2_bkey_null_ops;
 
-enum bkey_invalid_flags {
-       BKEY_INVALID_WRITE              = (1U << 0),
-       BKEY_INVALID_COMMIT             = (1U << 1),
-       BKEY_INVALID_JOURNAL            = (1U << 2),
-};
-
 /*
  * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
  * invalid, entire key will be deleted.
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 13c88d9533e5cdf5fa6bf58a397b5d71284494a3..a8283fdc7e63929d0a088e3b9bb6624e240f4de1 100644
@@ -1214,7 +1214,6 @@ wait_on_io:
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             const struct btree *b)
 {
-       const struct bkey_format *f = &b->format;
        struct bset_stats stats;
 
        memset(&stats, 0, sizeof(stats));
@@ -1228,9 +1227,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+       prt_newline(out);
 
-       prt_printf(out, "\n"
-              "    format: u64s %u fields %u %u %u %u %u\n"
+       prt_printf(out,
+              "    format: ");
+       bch2_bkey_format_to_text(out, &b->format);
+
+       prt_printf(out,
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
@@ -1238,12 +1241,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
-              f->key_u64s,
-              f->bits_per_field[0],
-              f->bits_per_field[1],
-              f->bits_per_field[2],
-              f->bits_per_field[3],
-              f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 49e9822dda1dd8d07b7be885aefdefe649345d34..a5f685fff604b6b0a620f555544fcec6477e2f85 100644
@@ -9,6 +9,7 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
@@ -43,7 +44,7 @@
 static bool should_restart_for_topology_repair(struct bch_fs *c)
 {
        return c->opts.fix_errors != FSCK_FIX_no &&
-               !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+               !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
 }
 
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index b45e382f7055b1a28a045707bf1116b04a31abc8..607575f83a00232b67d026a0423f9d486f044f2e 100644
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BTREE_GC_H
 #define _BCACHEFS_BTREE_GC_H
 
+#include "bkey.h"
 #include "btree_types.h"
 
 int bch2_check_topology(struct bch_fs *);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index c049876ee80be3cc4ca5e7813ddd42e1a9194490..3b654841ab00675b27220334813cc508227c366a 100644
@@ -17,6 +17,7 @@
 #include "io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
+#include "recovery.h"
 #include "super-io.h"
 #include "trace.h"
 
@@ -543,31 +544,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
        prt_str(out, ": ");
 }
 
-enum btree_err_type {
-       /*
-        * We can repair this locally, and we're after the checksum check so
-        * there's no need to try another replica:
-        */
-       BTREE_ERR_FIXABLE,
-       /*
-        * We can repair this if we have to, but we should try reading another
-        * replica if we can:
-        */
-       BTREE_ERR_WANT_RETRY,
-       /*
-        * Read another replica if we have one, otherwise consider the whole
-        * node bad:
-        */
-       BTREE_ERR_MUST_RETRY,
-       BTREE_ERR_BAD_NODE,
-       BTREE_ERR_INCOMPATIBLE,
-};
-
-enum btree_validate_ret {
-       BTREE_RETRY_READ = 64,
-};
-
-static int __btree_err(enum btree_err_type type,
+static int __btree_err(int ret,
                       struct bch_fs *c,
                       struct bch_dev *ca,
                       struct btree *b,
@@ -578,7 +555,6 @@ static int __btree_err(enum btree_err_type type,
 {
        struct printbuf out = PRINTBUF;
        va_list args;
-       int ret = -BCH_ERR_fsck_fix;
 
        btree_err_msg(&out, c, ca, b, i, b->written, write);
 
@@ -594,27 +570,26 @@ static int __btree_err(enum btree_err_type type,
                goto out;
        }
 
-       if (!have_retry && type == BTREE_ERR_WANT_RETRY)
-               type = BTREE_ERR_FIXABLE;
-       if (!have_retry && type == BTREE_ERR_MUST_RETRY)
-               type = BTREE_ERR_BAD_NODE;
+       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+               ret = -BCH_ERR_btree_node_read_err_fixable;
+       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+               ret = -BCH_ERR_btree_node_read_err_bad_node;
 
-       switch (type) {
-       case BTREE_ERR_FIXABLE:
+       switch (ret) {
+       case -BCH_ERR_btree_node_read_err_fixable:
                mustfix_fsck_err(c, "%s", out.buf);
                ret = -BCH_ERR_fsck_fix;
                break;
-       case BTREE_ERR_WANT_RETRY:
-       case BTREE_ERR_MUST_RETRY:
+       case -BCH_ERR_btree_node_read_err_want_retry:
+       case -BCH_ERR_btree_node_read_err_must_retry:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
-               ret = BTREE_RETRY_READ;
                break;
-       case BTREE_ERR_BAD_NODE:
+       case -BCH_ERR_btree_node_read_err_bad_node:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
                bch2_topology_error(c);
                ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
                break;
-       case BTREE_ERR_INCOMPATIBLE:
+       case -BCH_ERR_btree_node_read_err_incompatible:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
                ret = -BCH_ERR_fsck_errors_not_fixed;
                break;
@@ -631,8 +606,11 @@ fsck_err:
 ({                                                                     \
        int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
                                                                        \
-       if (_ret != -BCH_ERR_fsck_fix)                                  \
+       if (_ret != -BCH_ERR_fsck_fix) {                                \
+               ret = _ret;                                             \
                goto fsck_err;                                          \
+       }                                                               \
+                                                                       \
        *saw_error = true;                                              \
 })
 
@@ -696,19 +674,18 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         int write, bool have_retry, bool *saw_error)
 {
        unsigned version = le16_to_cpu(i->version);
-       const char *err;
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret = 0;
 
        btree_err_on(!bch2_version_compatible(version),
-                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
                     "unsupported bset version %u.%u",
                     BCH_VERSION_MAJOR(version),
                     BCH_VERSION_MINOR(version));
 
        if (btree_err_on(version < c->sb.version_min,
-                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
                         "bset version %u older than superblock version_min %u",
                         version, c->sb.version_min)) {
                mutex_lock(&c->sb_lock);
@@ -719,7 +696,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 
        if (btree_err_on(BCH_VERSION_MAJOR(version) >
                         BCH_VERSION_MAJOR(c->sb.version),
-                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
                         "bset version %u newer than superblock version %u",
                         version, c->sb.version)) {
                mutex_lock(&c->sb_lock);
@@ -729,11 +706,11 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        }
 
        btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
                     "BSET_SEPARATE_WHITEOUTS no longer supported");
 
        if (btree_err_on(offset + sectors > btree_sectors(c),
-                        BTREE_ERR_FIXABLE, c, ca, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
                         "bset past end of btree node")) {
                i->u64s = 0;
                ret = 0;
@@ -741,12 +718,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        }
 
        btree_err_on(offset && !i->u64s,
-                    BTREE_ERR_FIXABLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
                     "empty bset");
 
        btree_err_on(BSET_OFFSET(i) &&
                     BSET_OFFSET(i) != offset,
-                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
                     "bset at wrong sector offset");
 
        if (!offset) {
@@ -760,16 +737,16 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 
                        /* XXX endianness */
                        btree_err_on(bp->seq != bn->keys.seq,
-                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                                     "incorrect sequence number (wrong btree node)");
                }
 
                btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
                             "incorrect btree id");
 
                btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
                             "incorrect level");
 
                if (!write)
@@ -786,7 +763,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                        }
 
                        btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                                     "incorrect min_key: got %s should be %s",
                                     (printbuf_reset(&buf1),
                                      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
@@ -795,7 +772,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                }
 
                btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
                             "incorrect max key %s",
                             (printbuf_reset(&buf1),
                              bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
@@ -804,10 +781,12 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
 
-               err = bch2_bkey_format_validate(&bn->format);
-               btree_err_on(err,
-                            BTREE_ERR_BAD_NODE, c, ca, b, i,
-                            "invalid bkey format: %s", err);
+               btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+                            -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i,
+                            "invalid bkey format: %s\n  %s", buf1.buf,
+                            (printbuf_reset(&buf2),
+                             bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+               printbuf_reset(&buf1);
 
                compat_bformat(b->c.level, b->c.btree_id, version,
                               BSET_BIG_ENDIAN(i), write,
@@ -847,14 +826,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                struct bkey tmp;
 
                if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-                                BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
                                 "key extends past end of bset")) {
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
                if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-                                BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
                                 "invalid bkey format %u", k->format)) {
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_p_next(k),
@@ -878,7 +857,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                        prt_printf(&buf, "\n  ");
                        bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+                       btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_p_next(k),
@@ -902,7 +881,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 
                        bch2_dump_bset(c, b, i, 0);
 
-                       if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
+                       if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) {
                                i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                                memmove_u64s_down(k, bkey_p_next(k),
                                                  (u64 *) vstruct_end(i) - (u64 *) k);
@@ -945,16 +924,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        iter->size = (btree_blocks(c) + 1) * 2;
 
        if (bch2_meta_read_fault("btree"))
-               btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+               btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                          "dynamic fault");
 
        btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                    -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                     "bad magic: want %llx, got %llx",
                     bset_magic(c), le64_to_cpu(b->data->magic));
 
        btree_err_on(!b->data->keys.seq,
-                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                    -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                     "bad btree header: seq 0");
 
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
@@ -962,7 +941,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        &bkey_i_to_btree_ptr_v2(&b->key)->v;
 
                btree_err_on(b->data->keys.seq != bp->seq,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                            -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
                             "got wrong btree node (seq %llx want %llx)",
                             b->data->keys.seq, bp->seq);
        }
@@ -977,7 +956,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        i = &b->data->keys;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
                                     "unknown checksum type %llu",
                                     BSET_CSUM_TYPE(i));
 
@@ -985,7 +964,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
 
                        btree_err_on(bch2_crc_cmp(csum, b->data->csum),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
                                     "invalid checksum");
 
                        ret = bset_encrypt(c, i, b->written << 9);
@@ -995,7 +974,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
                        btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
                                     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-                                    BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL,
                                     "btree node does not have NEW_EXTENT_OVERWRITE set");
 
                        sectors = vstruct_sectors(b->data, c->block_bits);
@@ -1007,7 +986,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                break;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
                                     "unknown checksum type %llu",
                                     BSET_CSUM_TYPE(i));
 
@@ -1015,7 +994,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
 
                        btree_err_on(bch2_crc_cmp(csum, bne->csum),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
                                     "invalid checksum");
 
                        ret = bset_encrypt(c, i, b->written << 9);
@@ -1048,12 +1027,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                        true);
 
                btree_err_on(blacklisted && first,
-                            BTREE_ERR_FIXABLE, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
                             "first btree node bset has blacklisted journal seq (%llu)",
                             le64_to_cpu(i->journal_seq));
 
                btree_err_on(blacklisted && ptr_written,
-                            BTREE_ERR_FIXABLE, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
                             "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
                             le64_to_cpu(i->journal_seq),
                             b->written, b->written + sectors, ptr_written);
@@ -1072,7 +1051,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
        if (ptr_written) {
                btree_err_on(b->written < ptr_written,
-                            BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+                            -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
                             "btree node data missing: expected %u sectors, found %u",
                             ptr_written, b->written);
        } else {
@@ -1083,7 +1062,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                     !bch2_journal_seq_is_blacklisted(c,
                                                                      le64_to_cpu(bne->keys.journal_seq),
                                                                      true),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
                                     "found bset signature after last bset");
 
                /*
@@ -1137,7 +1116,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        prt_printf(&buf, "\n  ");
                        bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+                       btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
 
                        btree_keys_account_key_drop(&b->nr, 0, k);
 
@@ -1177,7 +1156,8 @@ out:
        printbuf_exit(&buf);
        return retry_read;
 fsck_err:
-       if (ret == BTREE_RETRY_READ)
+       if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+           ret == -BCH_ERR_btree_node_read_err_must_retry)
                retry_read = 1;
        else
                set_btree_node_read_error(b);
@@ -1363,14 +1343,14 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
                }
 
                written2 = btree_node_sectors_written(c, ra->buf[i]);
-               if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+               if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
                                 "btree node sectors written mismatch: %u != %u",
                                 written, written2) ||
                    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
                                 "found bset signature after last bset") ||
                    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
                                 "btree node replicas content mismatch"))
                        dump_bset_maps = true;
 
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index dfb77b23d1261030994b97d0e48a06ca2e2299b2..06dbb61710a9b686e4c02ec86f9c12c73e3b40a4 100644
@@ -5,6 +5,7 @@
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update.h"
@@ -12,7 +13,6 @@
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
-#include "recovery.h"
 #include "replicas.h"
 #include "subvolume.h"
 #include "trace.h"
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index c472aa8c58a09b8181aab307cd2ced3508c17545..8876f2b829fadc8c55830005bc6e06e0ab2c28ab 100644
@@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo
                                 unsigned, unsigned, unsigned, unsigned long);
 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
 
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+       if (k.k && bpos_eq(path->pos, k.k->p))
+               return k;
+
+       bkey_init(u);
+       u->p = path->pos;
+       return (struct bkey_s_c) { u, NULL };
+}
+
 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
                                        struct btree_iter *, struct bpos);
 
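
A short sketch of how the newly shared helper behaves at a call site (the surrounding caller is hypothetical; only the helper comes from this commit):

struct bkey u;
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(path, &u);

/*
 * k.k is never NULL here: it is either the slot's key, when that key really
 * is at path->pos, or "u" initialized as a deleted key with u.p == path->pos,
 * e.g. when a cached slot holds the same inode in a different snapshot.
 */
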
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
new file mode 100644
index 0000000..58a981b
--- /dev/null
+++ b/libbcachefs/btree_journal_iter.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay works normal
+ * btree lookups need to be able to find and return keys from the journal where
+ * they overwrite what's in the btree, so we have a special iterator and
+ * operations for the regular btree iter code to use:
+ */
+
+static int __journal_key_cmp(enum btree_id     l_btree_id,
+                            unsigned           l_level,
+                            struct bpos        l_pos,
+                            const struct journal_key *r)
+{
+       return (cmp_int(l_btree_id,     r->btree_id) ?:
+               cmp_int(l_level,        r->level) ?:
+               bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+       size_t gap_size = keys->size - keys->nr;
+
+       if (idx >= keys->gap)
+               idx += gap_size;
+       return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+       return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+                                       enum btree_id id, unsigned level,
+                                       struct bpos pos)
+{
+       size_t l = 0, r = keys->nr, m;
+
+       while (l < r) {
+               m = l + ((r - l) >> 1);
+               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+                       l = m + 1;
+               else
+                       r = m;
+       }
+
+       BUG_ON(l < keys->nr &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+
+       BUG_ON(l &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+
+       return l;
+}
+
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+                                     enum btree_id id, unsigned level,
+                                     struct bpos pos)
+{
+       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos,
+                                          struct bpos end_pos, size_t *idx)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       unsigned iters = 0;
+       struct journal_key *k;
+search:
+       if (!*idx)
+               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+       while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+               if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+                       return NULL;
+
+               if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
+                   !k->overwritten)
+                       return k->k;
+
+               (*idx)++;
+               iters++;
+               if (iters == 10) {
+                       *idx = 0;
+                       goto search;
+               }
+       }
+
+       return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos)
+{
+       size_t idx = 0;
+
+       return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       /* The key we just inserted is immediately before the gap: */
+       size_t gap_end = keys->gap + (keys->size - keys->nr);
+       struct btree_and_journal_iter *iter;
+
+       /*
+        * If an iterator points one after the key we just inserted, decrement
+        * the iterator so it points at the key we just inserted - if the
+        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+        * handle that:
+        */
+       list_for_each_entry(iter, &c->journal_iters, journal.list)
+               if (iter->journal.idx == gap_end)
+                       iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       size_t gap_size = keys->size - keys->nr;
+
+       list_for_each_entry(iter, &c->journal_iters, list) {
+               if (iter->idx > old_gap)
+                       iter->idx -= gap_size;
+               if (iter->idx >= new_gap)
+                       iter->idx += gap_size;
+       }
+}
+
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+                                unsigned level, struct bkey_i *k)
+{
+       struct journal_key n = {
+               .btree_id       = id,
+               .level          = level,
+               .k              = k,
+               .allocated      = true,
+               /*
+                * Ensure these keys are done last by journal replay, to unblock
+                * journal reclaim:
+                */
+               .journal_seq    = U32_MAX,
+       };
+       struct journal_keys *keys = &c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+       BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+
+       if (idx < keys->size &&
+           journal_key_cmp(&n, &keys->d[idx]) == 0) {
+               if (keys->d[idx].allocated)
+                       kfree(keys->d[idx].k);
+               keys->d[idx] = n;
+               return 0;
+       }
+
+       if (idx > keys->gap)
+               idx -= keys->size - keys->nr;
+
+       if (keys->nr == keys->size) {
+               struct journal_keys new_keys = {
+                       .nr                     = keys->nr,
+                       .size                   = max_t(size_t, keys->size, 8) * 2,
+               };
+
+               new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
+               if (!new_keys.d) {
+                       bch_err(c, "%s: error allocating new key array (size %zu)",
+                               __func__, new_keys.size);
+                       return -BCH_ERR_ENOMEM_journal_key_insert;
+               }
+
+               /* Since @keys was full, there was no gap: */
+               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+               kvfree(keys->d);
+               *keys = new_keys;
+
+               /* And now the gap is at the end: */
+               keys->gap = keys->nr;
+       }
+
+       journal_iters_move_gap(c, keys->gap, idx);
+
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+       keys->gap = idx;
+
+       keys->nr++;
+       keys->d[keys->gap++] = n;
+
+       journal_iters_fix(c);
+
+       return 0;
+}
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bkey_i *k)
+{
+       struct bkey_i *n;
+       int ret;
+
+       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+       if (!n)
+               return -BCH_ERR_ENOMEM_journal_key_insert;
+
+       bkey_copy(n, k);
+       ret = bch2_journal_key_insert_take(c, id, level, n);
+       if (ret)
+               kfree(n);
+       return ret;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bpos pos)
+{
+       struct bkey_i whiteout;
+
+       bkey_init(&whiteout.k);
+       whiteout.k.p = pos;
+
+       return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+                                 unsigned level, struct bpos pos)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+       if (idx < keys->size &&
+           keys->d[idx].btree_id       == btree &&
+           keys->d[idx].level          == level &&
+           bpos_eq(keys->d[idx].k->k.p, pos))
+               keys->d[idx].overwritten = true;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->idx < iter->keys->size) {
+               iter->idx++;
+               if (iter->idx == iter->keys->gap)
+                       iter->idx += iter->keys->size - iter->keys->nr;
+       }
+}
+
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+       struct journal_key *k = iter->keys->d + iter->idx;
+
+       while (k < iter->keys->d + iter->keys->size &&
+              k->btree_id      == iter->btree_id &&
+              k->level         == iter->level) {
+               if (!k->overwritten)
+                       return bkey_i_to_s_c(k->k);
+
+               bch2_journal_iter_advance(iter);
+               k = iter->keys->d + iter->idx;
+       }
+
+       return bkey_s_c_null;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+       list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+                                  struct journal_iter *iter,
+                                  enum btree_id id, unsigned level,
+                                  struct bpos pos)
+{
+       iter->btree_id  = id;
+       iter->level     = level;
+       iter->keys      = &c->journal_keys;
+       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                               iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+       if (bpos_eq(iter->pos, SPOS_MAX))
+               iter->at_end = true;
+       else
+               iter->pos = bpos_successor(iter->pos);
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+       struct bkey_s_c btree_k, journal_k, ret;
+again:
+       if (iter->at_end)
+               return bkey_s_c_null;
+
+       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+              bpos_lt(btree_k.k->p, iter->pos))
+               bch2_journal_iter_advance_btree(iter);
+
+       while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+              bpos_lt(journal_k.k->p, iter->pos))
+               bch2_journal_iter_advance(&iter->journal);
+
+       ret = journal_k.k &&
+               (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+               ? journal_k
+               : btree_k;
+
+       if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+               ret = bkey_s_c_null;
+
+       if (ret.k) {
+               iter->pos = ret.k->p;
+               if (bkey_deleted(ret.k)) {
+                       bch2_btree_and_journal_iter_advance(iter);
+                       goto again;
+               }
+       } else {
+               iter->pos = SPOS_MAX;
+               iter->at_end = true;
+       }
+
+       return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+       bch2_journal_iter_exit(&iter->journal);
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                                 struct bch_fs *c,
+                                                 struct btree *b,
+                                                 struct btree_node_iter node_iter,
+                                                 struct bpos pos)
+{
+       memset(iter, 0, sizeof(*iter));
+
+       iter->b = b;
+       iter->node_iter = node_iter;
+       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+       INIT_LIST_HEAD(&iter->journal.list);
+       iter->pos = b->data->min_key;
+       iter->at_end = false;
+}
+
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct bch_fs *c,
+                                               struct btree *b)
+{
+       struct btree_node_iter node_iter;
+
+       bch2_btree_node_iter_init_from_start(&node_iter, b);
+       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+       list_add(&iter->journal.list, &c->journal_iters);
+}
+
+/* sort and dedup all keys in the journal: */
+
+void bch2_journal_entries_free(struct bch_fs *c)
+{
+       struct journal_replay **i;
+       struct genradix_iter iter;
+
+       genradix_for_each(&c->journal_entries, iter, i)
+               if (*i)
+                       kvpfree(*i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&(*i)->j));
+       genradix_free(&c->journal_entries);
+}
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+       const struct journal_key *l = _l;
+       const struct journal_key *r = _r;
+
+       return  journal_key_cmp(l, r) ?:
+               cmp_int(l->journal_seq, r->journal_seq) ?:
+               cmp_int(l->journal_offset, r->journal_offset);
+}
+
+void bch2_journal_keys_free(struct journal_keys *keys)
+{
+       struct journal_key *i;
+
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
+       for (i = keys->d; i < keys->d + keys->nr; i++)
+               if (i->allocated)
+                       kfree(i->k);
+
+       kvfree(keys->d);
+       keys->d = NULL;
+       keys->nr = keys->gap = keys->size = 0;
+}
+
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+       struct journal_key *src, *dst;
+
+       sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+
+       src = dst = keys->d;
+       while (src < keys->d + keys->nr) {
+               while (src + 1 < keys->d + keys->nr &&
+                      src[0].btree_id  == src[1].btree_id &&
+                      src[0].level     == src[1].level &&
+                      bpos_eq(src[0].k->k.p, src[1].k->k.p))
+                       src++;
+
+               *dst++ = *src++;
+       }
+
+       keys->nr = dst - keys->d;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+       struct genradix_iter iter;
+       struct journal_replay *i, **_i;
+       struct jset_entry *entry;
+       struct bkey_i *k;
+       struct journal_keys *keys = &c->journal_keys;
+       size_t nr_keys = 0, nr_read = 0;
+
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               for_each_jset_key(k, entry, &i->j)
+                       nr_keys++;
+       }
+
+       if (!nr_keys)
+               return 0;
+
+       keys->size = roundup_pow_of_two(nr_keys);
+
+       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+       if (!keys->d) {
+               bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+                       nr_keys);
+
+               do {
+                       keys->size >>= 1;
+                       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+               } while (!keys->d && keys->size > nr_keys / 8);
+
+               if (!keys->d) {
+                       bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+                               keys->size);
+                       return -BCH_ERR_ENOMEM_journal_keys_sort;
+               }
+       }
+
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               cond_resched();
+
+               for_each_jset_key(k, entry, &i->j) {
+                       if (keys->nr == keys->size) {
+                               __journal_keys_sort(keys);
+
+                               if (keys->nr > keys->size * 7 / 8) {
+                                       bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+                                               keys->nr, keys->size, nr_read, nr_keys);
+                                       return -BCH_ERR_ENOMEM_journal_keys_sort;
+                               }
+                       }
+
+                       keys->d[keys->nr++] = (struct journal_key) {
+                               .btree_id       = entry->btree_id,
+                               .level          = entry->level,
+                               .k              = k,
+                               .journal_seq    = le64_to_cpu(i->j.seq),
+                               .journal_offset = k->_data - i->j._data,
+                       };
+
+                       nr_read++;
+               }
+       }
+
+       __journal_keys_sort(keys);
+       keys->gap = keys->nr;
+
+       bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+       return 0;
+}
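
The journal_keys array manipulated above is kept sorted with a movable gap, so repeated insertions near the same position during recovery stay cheap; idx_to_pos() maps a logical index to an array slot by skipping the gap. An illustrative layout (the values below are made up, not from the commit):

/*
 * size = 8, nr = 6, gap = 4, gap_size = size - nr = 2:
 *
 *   logical idx:   0  1  2  3        4  5
 *   array slot:  [ A  B  C  D  .  .  E  F ]
 *                              ^ two empty slots at the gap
 *
 * idx_to_pos(keys, 2) == 2   (before the gap: unchanged)
 * idx_to_pos(keys, 4) == 6   (at or after the gap: shifted by gap_size)
 *
 * bch2_journal_key_insert_take() moves the gap to the insertion point with
 * move_gap(), stores the new key in the first slot of the gap, then advances
 * keys->gap past it.
 */
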
diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h
new file mode 100644
index 0000000..5d64e7e
--- /dev/null
+++ b/libbcachefs/btree_journal_iter.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+struct journal_iter {
+       struct list_head        list;
+       enum btree_id           btree_id;
+       unsigned                level;
+       size_t                  idx;
+       struct journal_keys     *keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+       struct btree            *b;
+       struct btree_node_iter  node_iter;
+       struct bkey             unpacked;
+
+       struct journal_iter     journal;
+       struct bpos             pos;
+       bool                    at_end;
+};
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+                               unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+                                          unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+                                unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+                           unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+                           unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+                                 unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                               struct bch_fs *, struct btree *,
+                               struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                                               struct bch_fs *,
+                                               struct btree *);
+
+void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_entries_free(struct bch_fs *);
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
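
The iterators declared in this header overlay keys still sitting in the journal on top of what is already in the btree, so callers (recovery in particular) always see the newest version of a key regardless of whether it has been flushed yet. A rough sketch of that merge idea over two sorted streams, with the journal side winning ties; illustrative only, and the demo_* names are not bcachefs symbols:

#include <stdio.h>
#include <stddef.h>

/* Two sorted key streams; the journal side takes precedence on equal positions. */
struct demo_stream {
	const unsigned long	*pos;	/* sorted positions, stand-in for struct bpos */
	const int		*val;
	size_t			nr, idx;
};

static int demo_next(struct demo_stream *btree, struct demo_stream *journal,
		     unsigned long *pos, int *val)
{
	int have_b = btree->idx < btree->nr;
	int have_j = journal->idx < journal->nr;

	if (!have_b && !have_j)
		return 0;

	if (have_j && (!have_b || journal->pos[journal->idx] <= btree->pos[btree->idx])) {
		/* Journal key wins; skip the btree key it overwrites, if any: */
		if (have_b && btree->pos[btree->idx] == journal->pos[journal->idx])
			btree->idx++;
		*pos = journal->pos[journal->idx];
		*val = journal->val[journal->idx++];
	} else {
		*pos = btree->pos[btree->idx];
		*val = btree->val[btree->idx++];
	}
	return 1;
}

int main(void)
{
	const unsigned long	btree_pos[]   = { 1, 2, 4 };
	const int		btree_val[]   = { 10, 20, 40 };
	const unsigned long	journal_pos[] = { 2, 3 };
	const int		journal_val[] = { 21, 31 };
	struct demo_stream	btree   = { btree_pos,   btree_val,   3, 0 };
	struct demo_stream	journal = { journal_pos, journal_val, 2, 0 };
	unsigned long pos;
	int val;

	/* Prints 1 -> 10, 2 -> 21 (journal wins), 3 -> 31, 4 -> 40: */
	while (demo_next(&btree, &journal, &pos, &val))
		printf("pos %lu -> %d\n", pos, val);
	return 0;
}
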
similarity index 55%
rename from libbcachefs/btree_update_leaf.c
rename to libbcachefs/btree_trans_commit.c
index 369e37a415f3c61733ff83a494c07d0cb9b3a34c..78a09aa050c7052d1a5a7ee6792f595c35831299 100644 (file)
@@ -1,45 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "btree_io.h"
 #include "btree_iter.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
-#include "btree_locking.h"
+#include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "buckets.h"
-#include "debug.h"
 #include "errcode.h"
 #include "error.h"
-#include "extent_update.h"
 #include "journal.h"
 #include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery.h"
-#include "subvolume.h"
 #include "replicas.h"
-#include "trace.h"
+#include "subvolume.h"
 
 #include <linux/prefetch.h>
-#include <linux/sort.h>
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
-       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
-       if (k.k && bpos_eq(path->pos, k.k->p))
-               return k;
-
-       bkey_init(u);
-       u->p = path->pos;
-       return (struct bkey_s_c) { u, NULL };
-}
 
 static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
 {
@@ -64,20 +41,6 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
 #endif
 }
 
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
-                         struct bkey_i *, enum btree_update_flags,
-                         unsigned long ip);
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
-                                        const struct btree_insert_entry *r)
-{
-       return   cmp_int(l->btree_id,   r->btree_id) ?:
-                cmp_int(l->cached,     r->cached) ?:
-                -cmp_int(l->level,     r->level) ?:
-                bpos_cmp(l->k->k.p,    r->k->k.p);
-}
-
 static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
 {
        return i->path->l + i->level;
@@ -1191,917 +1154,3 @@ err:
 
        goto retry;
 }
-
-static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
-                                         enum btree_id id,
-                                         struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, id, pos,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       while (1) {
-               k = bch2_btree_iter_prev(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               if (!k.k)
-                       break;
-
-               if (!bkey_eq(pos, k.k->p))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
-                       ret = 1;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
-                                         enum btree_id id,
-                                         struct bpos pos)
-{
-       if (!btree_type_has_snapshots(id) ||
-           bch2_snapshot_is_leaf(trans->c, pos.snapshot))
-               return 0;
-
-       return __check_pos_snapshot_overwritten(trans, id, pos);
-}
-
-static noinline int extent_front_merge(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      struct bkey_s_c k,
-                                      struct bkey_i **insert,
-                                      enum btree_update_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *update;
-       int ret;
-
-       update = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(update);
-       if (ret)
-               return ret;
-
-       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
-               return 0;
-
-       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
-               check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       ret = bch2_btree_delete_at(trans, iter, flags);
-       if (ret)
-               return ret;
-
-       *insert = update;
-       return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
-                                     struct btree_iter *iter,
-                                     struct bkey_i *insert,
-                                     struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
-               check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
-       return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-                                     enum btree_id btree_id, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u32 snapshot = pos.snapshot;
-       int ret;
-
-       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
-               return 0;
-
-       pos.snapshot++;
-
-       for_each_btree_key_norestart(trans, iter, btree_id, pos,
-                          BTREE_ITER_ALL_SNAPSHOTS|
-                          BTREE_ITER_NOPRESERVE, k, ret) {
-               if (!bkey_eq(k.k->p, pos))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
-                                             k.k->p.snapshot)) {
-                       ret = !bkey_whiteout(k.k);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-                                  enum btree_id id,
-                                  struct bpos old_pos,
-                                  struct bpos new_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter old_iter, new_iter = { NULL };
-       struct bkey_s_c old_k, new_k;
-       snapshot_id_list s;
-       struct bkey_i *update;
-       int ret;
-
-       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
-               return 0;
-
-       darray_init(&s);
-
-       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
-              !(ret = bkey_err(old_k)) &&
-              bkey_eq(old_pos, old_k.k->p)) {
-               struct bpos whiteout_pos =
-                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
-               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
-                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
-                       continue;
-
-               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
-                                          BTREE_ITER_NOT_EXTENTS|
-                                          BTREE_ITER_INTENT);
-               ret = bkey_err(new_k);
-               if (ret)
-                       break;
-
-               if (new_k.k->type == KEY_TYPE_deleted) {
-                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-                       ret = PTR_ERR_OR_ZERO(update);
-                       if (ret)
-                               break;
-
-                       bkey_init(&update->k);
-                       update->k.p             = whiteout_pos;
-                       update->k.type          = KEY_TYPE_whiteout;
-
-                       ret = bch2_trans_update(trans, &new_iter, update,
-                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-               }
-               bch2_trans_iter_exit(trans, &new_iter);
-
-               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &new_iter);
-       bch2_trans_iter_exit(trans, &old_iter);
-       darray_exit(&s);
-
-       return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      enum btree_update_flags flags,
-                                      struct bkey_s_c old,
-                                      struct bkey_s_c new)
-{
-       enum btree_id btree_id = iter->btree_id;
-       struct bkey_i *update;
-       struct bpos new_start = bkey_start_pos(new.k);
-       bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
-       bool back_split  = bkey_gt(old.k->p, new.k->p);
-       int ret = 0, compressed_sectors;
-
-       /*
-        * If we're going to be splitting a compressed extent, note it
-        * so that __bch2_trans_commit() can increase our disk
-        * reservation:
-        */
-       if (((front_split && back_split) ||
-            ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
-           (compressed_sectors = bch2_bkey_sectors_compressed(old)))
-               trans->extra_journal_res += compressed_sectors;
-
-       if (front_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_back(new_start, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       /* If we're overwriting in a different snapshot - middle split: */
-       if (old.k->p.snapshot != new.k->p.snapshot &&
-           (front_split || back_split)) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new_start, update);
-               bch2_cut_back(new.k->p, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (bkey_le(old.k->p, new.k->p)) {
-               update = bch2_trans_kmalloc(trans, sizeof(*update));
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bkey_init(&update->k);
-               update->k.p = old.k->p;
-               update->k.p.snapshot = new.k->p.snapshot;
-
-               if (new.k->p.snapshot != old.k->p.snapshot) {
-                       update->k.type = KEY_TYPE_whiteout;
-               } else if (btree_type_has_snapshots(btree_id)) {
-                       ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
-                       if (ret < 0)
-                               return ret;
-                       if (ret)
-                               update->k.type = KEY_TYPE_whiteout;
-               }
-
-               ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (back_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new.k->p, update);
-
-               ret = bch2_trans_update_by_path(trans, iter->path, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                         flags, _RET_IP_);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
-                                   struct btree_iter *orig_iter,
-                                   struct bkey_i *insert,
-                                   enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       enum btree_id btree_id = orig_iter->btree_id;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_WITH_UPDATES|
-                            BTREE_ITER_NOT_EXTENTS);
-       k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-       if ((ret = bkey_err(k)))
-               goto err;
-       if (!k.k)
-               goto out;
-
-       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
-               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
-                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
-                       if (ret)
-                               goto err;
-               }
-
-               goto next;
-       }
-
-       while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-               bool done = bkey_lt(insert->k.p, k.k->p);
-
-               ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-               if (ret)
-                       goto err;
-
-               if (done)
-                       goto out;
-next:
-               bch2_btree_iter_advance(&iter);
-               k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       goto err;
-               if (!k.k)
-                       goto out;
-       }
-
-       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
-               ret = extent_back_merge(trans, &iter, insert, k);
-               if (ret)
-                       goto err;
-       }
-out:
-       if (!bkey_deleted(&insert->k))
-               ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
-                                           struct btree_path *path,
-                                           struct btree_insert_entry *i,
-                                           enum btree_update_flags flags,
-                                           unsigned long ip)
-{
-       struct btree_path *btree_path;
-       struct bkey k;
-       int ret;
-
-       btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                  BTREE_ITER_INTENT, _THIS_IP_);
-       ret = bch2_btree_path_traverse(trans, btree_path, 0);
-       if (ret)
-               goto out;
-
-       /*
-        * The old key in the insert entry might actually refer to an existing
-        * key in the btree that has been deleted from cache and not yet
-        * flushed. Check for this and skip the flush so we don't run triggers
-        * against a stale key.
-        */
-       bch2_btree_path_peek_slot_exact(btree_path, &k);
-       if (!bkey_deleted(&k))
-               goto out;
-
-       i->key_cache_already_flushed = true;
-       i->flags |= BTREE_TRIGGER_NORUN;
-
-       btree_path_set_should_be_locked(btree_path);
-       ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
-out:
-       bch2_path_put(trans, btree_path, true);
-       return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                         struct bkey_i *k, enum btree_update_flags flags,
-                         unsigned long ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i, n;
-       u64 seq = 0;
-       int cmp;
-
-       EBUG_ON(!path->should_be_locked);
-       EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-       EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
-       /*
-        * The transaction journal res hasn't been allocated at this point.
-        * That occurs at commit time. Reuse the seq field to pass in the seq
-        * of a prejournaled key.
-        */
-       if (flags & BTREE_UPDATE_PREJOURNAL)
-               seq = trans->journal_res.seq;
-
-       n = (struct btree_insert_entry) {
-               .flags          = flags,
-               .bkey_type      = __btree_node_type(path->level, path->btree_id),
-               .btree_id       = path->btree_id,
-               .level          = path->level,
-               .cached         = path->cached,
-               .path           = path,
-               .k              = k,
-               .seq            = seq,
-               .ip_allocated   = ip,
-       };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(i != trans->updates &&
-                      btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
-       /*
-        * Pending updates are kept sorted: first, find position of new update,
-        * then delete/trim any updates the new update overwrites:
-        */
-       trans_for_each_update(trans, i) {
-               cmp = btree_insert_entry_cmp(&n, i);
-               if (cmp <= 0)
-                       break;
-       }
-
-       if (!cmp && i < trans->updates + trans->nr_updates) {
-               EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
-               bch2_path_put(trans, i->path, true);
-               i->flags        = n.flags;
-               i->cached       = n.cached;
-               i->k            = n.k;
-               i->path         = n.path;
-               i->seq          = n.seq;
-               i->ip_allocated = n.ip_allocated;
-       } else {
-               array_insert_item(trans->updates, trans->nr_updates,
-                                 i - trans->updates, n);
-
-               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
-               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
-               if (unlikely(trans->journal_replay_not_finished)) {
-                       struct bkey_i *j_k =
-                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
-                       if (j_k) {
-                               i->old_k = j_k->k;
-                               i->old_v = &j_k->v;
-                       }
-               }
-       }
-
-       __btree_path_get(i->path, true);
-
-       /*
-        * If a key is present in the key cache, it must also exist in the
-        * btree - this is necessary for cache coherency. When iterating over
-        * a btree that's cached in the key cache, the btree iter code checks
-        * the key cache - but the key has to exist in the btree for that to
-        * work:
-        */
-       if (path->cached && bkey_deleted(&i->old_k))
-               return flush_new_cached_update(trans, path, i, flags, ip);
-
-       return 0;
-}
-
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
-                                  struct bkey_i *k, enum btree_update_flags flags)
-{
-       struct btree_path *path = iter->update_path ?: iter->path;
-       struct bkey_cached *ck;
-       int ret;
-
-       if (iter->flags & BTREE_ITER_IS_EXTENTS)
-               return bch2_trans_update_extent(trans, iter, k, flags);
-
-       if (bkey_deleted(&k->k) &&
-           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
-           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
-               if (unlikely(ret < 0))
-                       return ret;
-
-               if (ret)
-                       k->k.type = KEY_TYPE_whiteout;
-       }
-
-       /*
-        * Ensure that updates to cached btrees go to the key cache:
-        */
-       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
-           !path->cached &&
-           !path->level &&
-           btree_id_cached(trans->c, path->btree_id)) {
-               if (!iter->key_cache_path ||
-                   !iter->key_cache_path->should_be_locked ||
-                   !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
-                       if (!iter->key_cache_path)
-                               iter->key_cache_path =
-                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                                     BTREE_ITER_INTENT|
-                                                     BTREE_ITER_CACHED, _THIS_IP_);
-
-                       iter->key_cache_path =
-                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-                                                       iter->flags & BTREE_ITER_INTENT,
-                                                       _THIS_IP_);
-
-                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-                                                      BTREE_ITER_CACHED);
-                       if (unlikely(ret))
-                               return ret;
-
-                       ck = (void *) iter->key_cache_path->l[0].b;
-
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                               trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-                               return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-                       }
-
-                       btree_path_set_should_be_locked(iter->key_cache_path);
-               }
-
-               path = iter->key_cache_path;
-       }
-
-       return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
-}
-
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
-                                      struct btree_iter *iter, struct bkey_i *k,
-                                      enum btree_update_flags flags)
-{
-       trans->journal_res.seq = seq;
-       return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
-                                                BTREE_UPDATE_PREJOURNAL);
-}
-
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
-                                           enum btree_id btree,
-                                           struct bkey_i *k)
-{
-       struct btree_write_buffered_key *i;
-       int ret;
-
-       EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
-       EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
-       trans_for_each_wb_update(trans, i) {
-               if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
-                       bkey_copy(&i->k, k);
-                       return 0;
-               }
-       }
-
-       if (!trans->wb_updates ||
-           trans->nr_wb_updates == trans->wb_updates_size) {
-               struct btree_write_buffered_key *u;
-
-               if (trans->nr_wb_updates == trans->wb_updates_size) {
-                       struct btree_transaction_stats *s = btree_trans_stats(trans);
-
-                       BUG_ON(trans->wb_updates_size > U8_MAX / 2);
-                       trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
-                       if (s)
-                               s->wb_updates_size = trans->wb_updates_size;
-               }
-
-               u = bch2_trans_kmalloc_nomemzero(trans,
-                                       trans->wb_updates_size *
-                                       sizeof(struct btree_write_buffered_key));
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       return ret;
-
-               if (trans->nr_wb_updates)
-                       memcpy(u, trans->wb_updates, trans->nr_wb_updates *
-                              sizeof(struct btree_write_buffered_key));
-               trans->wb_updates = u;
-       }
-
-       trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
-               .btree  = btree,
-       };
-
-       bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
-       trans->nr_wb_updates++;
-
-       return 0;
-}
-
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
-                            enum btree_id btree, struct bpos end)
-{
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
-       k = bch2_btree_iter_prev(iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       bch2_btree_iter_advance(iter);
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       BUG_ON(k.k->type != KEY_TYPE_deleted);
-
-       if (bkey_gt(k.k->p, end)) {
-               ret = -BCH_ERR_ENOSPC_btree_slot;
-               goto err;
-       }
-
-       return 0;
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
-void bch2_trans_commit_hook(struct btree_trans *trans,
-                           struct btree_trans_commit_hook *h)
-{
-       h->next = trans->hooks;
-       trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
-                               enum btree_id btree, struct bkey_i *k,
-                               enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
-                       struct bkey_i *k, enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a btree
- * @c:                 pointer to struct bch_fs
- * @id:                        btree to insert into
- * @k:                 key to insert
- * @disk_res:          disk reservation, or NULL
- * @journal_seq:       if non-NULL, set to the journal sequence number of the commit
- * @flags:             transaction commit flags
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
-                     struct bkey_i *k,
-                     struct disk_reservation *disk_res,
-                     u64 *journal_seq, int flags)
-{
-       return bch2_trans_do(c, disk_res, journal_seq, flags,
-                            __bch2_btree_insert(&trans, id, k, 0));
-}
-
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
-                               unsigned len, unsigned update_flags)
-{
-       struct bkey_i *k;
-
-       k = bch2_trans_kmalloc(trans, sizeof(*k));
-       if (IS_ERR(k))
-               return PTR_ERR(k);
-
-       bkey_init(&k->k);
-       k->k.p = iter->pos;
-       bch2_key_resize(&k->k, len);
-       return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned update_flags)
-{
-       return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
-                                 enum btree_id btree, struct bpos pos)
-{
-       struct bkey_i *k;
-
-       k = bch2_trans_kmalloc(trans, sizeof(*k));
-       if (IS_ERR(k))
-               return PTR_ERR(k);
-
-       bkey_init(&k->k);
-       k->k.p = pos;
-       return bch2_trans_update_buffered(trans, btree, k);
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
-                                 struct bpos start, struct bpos end,
-                                 unsigned update_flags,
-                                 u64 *journal_seq)
-{
-       u32 restart_count = trans->restart_count;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
-       while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(trans->c, 0);
-               struct bkey_i delete;
-
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               bkey_init(&delete.k);
-
-               /*
-                * This could probably be more efficient for extents:
-                */
-
-               /*
-                * For extents, iter.pos won't necessarily be the same as
-                * bkey_start_pos(k.k) (for non extents they always will be the
-                * same). It's important that we delete starting from iter.pos
-                * because the range we want to delete could start in the middle
-                * of k.
-                *
-                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
-                * bkey_start_pos(k.k)).
-                */
-               delete.k.p = iter.pos;
-
-               if (iter.flags & BTREE_ITER_IS_EXTENTS)
-                       bch2_key_resize(&delete.k,
-                                       bpos_min(end, k.k->p).offset -
-                                       iter.pos.offset);
-
-               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
-                       bch2_trans_commit(trans, &disk_res, journal_seq,
-                                         BTREE_INSERT_NOFAIL);
-               bch2_disk_reservation_put(trans->c, &disk_res);
-err:
-               /*
-                * the bch2_trans_begin() call is in a weird place because we
-                * need to call it after every transaction commit, to avoid path
-                * overflow, but don't want to call it if the delete operation
-                * is a no-op and we have no work to do:
-                */
-               bch2_trans_begin(trans);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
-                           struct bpos start, struct bpos end,
-                           unsigned update_flags,
-                           u64 *journal_seq)
-{
-       int ret = bch2_trans_run(c,
-                       bch2_btree_delete_range_trans(&trans, id, start, end,
-                                                     update_flags, journal_seq));
-       if (ret == -BCH_ERR_transaction_restart_nested)
-               ret = 0;
-       return ret;
-}
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
-                      struct bpos pos, bool set)
-{
-       struct bkey_i *k;
-       int ret = 0;
-
-       k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
-       ret = PTR_ERR_OR_ZERO(k);
-       if (unlikely(ret))
-               return ret;
-
-       bkey_init(&k->k);
-       k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-       k->k.p = pos;
-
-       return bch2_trans_update_buffered(trans, btree, k);
-}
-
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
-{
-       struct printbuf buf = PRINTBUF;
-       struct jset_entry_log *l;
-       unsigned u64s;
-       int ret;
-
-       prt_vprintf(&buf, fmt, args);
-       ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
-       if (ret)
-               goto err;
-
-       u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
-       ret = darray_make_room(entries, jset_u64s(u64s));
-       if (ret)
-               goto err;
-
-       l = (void *) &darray_top(*entries);
-       l->entry.u64s           = cpu_to_le16(u64s);
-       l->entry.btree_id       = 0;
-       l->entry.level          = 1;
-       l->entry.type           = BCH_JSET_ENTRY_log;
-       l->entry.pad[0]         = 0;
-       l->entry.pad[1]         = 0;
-       l->entry.pad[2]         = 0;
-       memcpy(l->d, buf.buf, buf.pos);
-       while (buf.pos & 7)
-               l->d[buf.pos++] = '\0';
-
-       entries->nr += jset_u64s(u64s);
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
-                 va_list args)
-{
-       int ret;
-
-       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
-               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
-       } else {
-               ret = bch2_trans_do(c, NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|commit_flags,
-                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
-       }
-
-       return ret;
-}
-
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, 0, fmt, args);
-       va_end(args);
-       return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
-       va_end(args);
-       return ret;
-}
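
All of the update-path helpers removed above move into the new btree_update.c below. One invariant that code preserves is that a transaction's pending updates stay sorted by (btree id, cached, level, position): bch2_trans_update_by_path() scans for the insert position with btree_insert_entry_cmp() and then either overwrites an existing entry for the same position or inserts a new one at its sorted slot via array_insert_item(). A small self-contained sketch of that replace-or-insert pattern, ignoring the cached/level terms for brevity (demo_* names are illustrative, not bcachefs symbols):

#include <stdio.h>
#include <string.h>

#define DEMO_MAX_UPDATES	8

struct demo_update {
	unsigned	btree_id;
	unsigned long	pos;	/* stand-in for struct bpos */
	int		val;
};

static int demo_update_cmp(const struct demo_update *l, const struct demo_update *r)
{
	if (l->btree_id != r->btree_id)
		return l->btree_id < r->btree_id ? -1 : 1;
	return l->pos < r->pos ? -1 : l->pos > r->pos ? 1 : 0;
}

/* Overwrite an existing entry for the same position, or insert at the sorted slot: */
static int demo_trans_update(struct demo_update *u, size_t *nr, struct demo_update n)
{
	size_t i;

	for (i = 0; i < *nr; i++)
		if (demo_update_cmp(&n, &u[i]) <= 0)
			break;

	if (i < *nr && !demo_update_cmp(&n, &u[i])) {
		u[i] = n;			/* replace pending update */
		return 0;
	}

	if (*nr == DEMO_MAX_UPDATES)
		return -1;

	memmove(&u[i + 1], &u[i], (*nr - i) * sizeof(u[0]));
	u[i] = n;
	(*nr)++;
	return 0;
}

int main(void)
{
	struct demo_update updates[DEMO_MAX_UPDATES];
	size_t i, nr = 0;

	demo_trans_update(updates, &nr, (struct demo_update) { 0, 7, 1 });
	demo_trans_update(updates, &nr, (struct demo_update) { 0, 3, 2 });
	demo_trans_update(updates, &nr, (struct demo_update) { 0, 7, 3 }); /* replaces pos 7 */

	for (i = 0; i < nr; i++)	/* prints pos 3 val 2, then pos 7 val 3 */
		printf("pos %lu val %d\n", updates[i].pos, updates[i].val);
	return 0;
}
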
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
new file mode 100644 (file)
index 0000000..612fba6
--- /dev/null
@@ -0,0 +1,943 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "subvolume.h"
+#include "trace.h"
+
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+                                        const struct btree_insert_entry *r)
+{
+       return   cmp_int(l->btree_id,   r->btree_id) ?:
+                cmp_int(l->cached,     r->cached) ?:
+                -cmp_int(l->level,     r->level) ?:
+                bpos_cmp(l->k->k.p,    r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                         struct bkey_i *, enum btree_update_flags,
+                         unsigned long ip);
+
+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, id, pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (!k.k)
+                       break;
+
+               if (!bkey_eq(pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+                       ret = 1;
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
+{
+       if (!btree_type_has_snapshots(id) ||
+           bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+               return 0;
+
+       return __check_pos_snapshot_overwritten(trans, id, pos);
+}
+
+static noinline int extent_front_merge(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      struct bkey_s_c k,
+                                      struct bkey_i **insert,
+                                      enum btree_update_flags flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *update;
+       int ret;
+
+       update = bch2_bkey_make_mut_noupdate(trans, k);
+       ret = PTR_ERR_OR_ZERO(update);
+       if (ret)
+               return ret;
+
+       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+               return 0;
+
+       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
+               check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       ret = bch2_btree_delete_at(trans, iter, flags);
+       if (ret)
+               return ret;
+
+       *insert = update;
+       return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+                                     struct btree_iter *iter,
+                                     struct bkey_i *insert,
+                                     struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       int ret;
+
+       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
+               check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+                                     enum btree_id btree_id, struct bpos pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot = pos.snapshot;
+       int ret;
+
+       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+               return 0;
+
+       pos.snapshot++;
+
+       for_each_btree_key_norestart(trans, iter, btree_id, pos,
+                          BTREE_ITER_ALL_SNAPSHOTS|
+                          BTREE_ITER_NOPRESERVE, k, ret) {
+               if (!bkey_eq(k.k->p, pos))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+                                             k.k->p.snapshot)) {
+                       ret = !bkey_whiteout(k.k);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+                                  enum btree_id id,
+                                  struct bpos old_pos,
+                                  struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter old_iter, new_iter = { NULL };
+       struct bkey_s_c old_k, new_k;
+       snapshot_id_list s;
+       struct bkey_i *update;
+       int ret;
+
+       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+               return 0;
+
+       darray_init(&s);
+
+       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+              !(ret = bkey_err(old_k)) &&
+              bkey_eq(old_pos, old_k.k->p)) {
+               struct bpos whiteout_pos =
+                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+                       continue;
+
+               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+                                          BTREE_ITER_NOT_EXTENTS|
+                                          BTREE_ITER_INTENT);
+               ret = bkey_err(new_k);
+               if (ret)
+                       break;
+
+               if (new_k.k->type == KEY_TYPE_deleted) {
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (ret)
+                               break;
+
+                       bkey_init(&update->k);
+                       update->k.p             = whiteout_pos;
+                       update->k.type          = KEY_TYPE_whiteout;
+
+                       ret = bch2_trans_update(trans, &new_iter, update,
+                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               }
+               bch2_trans_iter_exit(trans, &new_iter);
+
+               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &new_iter);
+       bch2_trans_iter_exit(trans, &old_iter);
+       darray_exit(&s);
+
+       return ret;
+}
+
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      enum btree_update_flags flags,
+                                      struct bkey_s_c old,
+                                      struct bkey_s_c new)
+{
+       enum btree_id btree_id = iter->btree_id;
+       struct bkey_i *update;
+       struct bpos new_start = bkey_start_pos(new.k);
+       bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+       bool back_split  = bkey_gt(old.k->p, new.k->p);
+       int ret = 0, compressed_sectors;
+
+       /*
+        * If we're going to be splitting a compressed extent, note it
+        * so that __bch2_trans_commit() can increase our disk
+        * reservation:
+        */
+       if (((front_split && back_split) ||
+            ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
+           (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+               trans->extra_journal_res += compressed_sectors;
+
+       if (front_split) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_back(new_start, update);
+
+               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                       old.k->p, update->k.p) ?:
+                       bch2_btree_insert_nonextent(trans, btree_id, update,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       /* If we're overwriting in a different snapshot - middle split: */
+       if (old.k->p.snapshot != new.k->p.snapshot &&
+           (front_split || back_split)) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_front(new_start, update);
+               bch2_cut_back(new.k->p, update);
+
+               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                       old.k->p, update->k.p) ?:
+                       bch2_btree_insert_nonextent(trans, btree_id, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       if (bkey_le(old.k->p, new.k->p)) {
+               update = bch2_trans_kmalloc(trans, sizeof(*update));
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bkey_init(&update->k);
+               update->k.p = old.k->p;
+               update->k.p.snapshot = new.k->p.snapshot;
+
+               if (new.k->p.snapshot != old.k->p.snapshot) {
+                       update->k.type = KEY_TYPE_whiteout;
+               } else if (btree_type_has_snapshots(btree_id)) {
+                       ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+                       if (ret < 0)
+                               return ret;
+                       if (ret)
+                               update->k.type = KEY_TYPE_whiteout;
+               }
+
+               ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       if (back_split) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_front(new.k->p, update);
+
+               ret = bch2_trans_update_by_path(trans, iter->path, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                         flags, _RET_IP_);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int bch2_trans_update_extent(struct btree_trans *trans,
+                                   struct btree_iter *orig_iter,
+                                   struct bkey_i *insert,
+                                   enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       enum btree_id btree_id = orig_iter->btree_id;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_NOT_EXTENTS);
+       k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+       if ((ret = bkey_err(k)))
+               goto err;
+       if (!k.k)
+               goto out;
+
+       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
+                       if (ret)
+                               goto err;
+               }
+
+               goto next;
+       }
+
+       while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+               bool done = bkey_lt(insert->k.p, k.k->p);
+
+               ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+               if (ret)
+                       goto err;
+
+               if (done)
+                       goto out;
+next:
+               bch2_btree_iter_advance(&iter);
+               k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+               if ((ret = bkey_err(k)))
+                       goto err;
+               if (!k.k)
+                       goto out;
+       }
+
+       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+               ret = extent_back_merge(trans, &iter, insert, k);
+               if (ret)
+                       goto err;
+       }
+out:
+       if (!bkey_deleted(&insert->k))
+               ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+                                           struct btree_path *path,
+                                           struct btree_insert_entry *i,
+                                           enum btree_update_flags flags,
+                                           unsigned long ip)
+{
+       struct btree_path *btree_path;
+       struct bkey k;
+       int ret;
+
+       btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                  BTREE_ITER_INTENT, _THIS_IP_);
+       ret = bch2_btree_path_traverse(trans, btree_path, 0);
+       if (ret)
+               goto out;
+
+       /*
+        * The old key in the insert entry might actually refer to an existing
+        * key in the btree that has been deleted from cache and not yet
+        * flushed. Check for this and skip the flush so we don't run triggers
+        * against a stale key.
+        */
+       bch2_btree_path_peek_slot_exact(btree_path, &k);
+       if (!bkey_deleted(&k))
+               goto out;
+
+       i->key_cache_already_flushed = true;
+       i->flags |= BTREE_TRIGGER_NORUN;
+
+       btree_path_set_should_be_locked(btree_path);
+       ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+out:
+       bch2_path_put(trans, btree_path, true);
+       return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags,
+                         unsigned long ip)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i, n;
+       u64 seq = 0;
+       int cmp;
+
+       EBUG_ON(!path->should_be_locked);
+       EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+       EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+       /*
+        * The transaction journal res hasn't been allocated at this point.
+        * That occurs at commit time. Reuse the seq field to pass in the seq
+        * of a prejournaled key.
+        */
+       if (flags & BTREE_UPDATE_PREJOURNAL)
+               seq = trans->journal_res.seq;
+
+       n = (struct btree_insert_entry) {
+               .flags          = flags,
+               .bkey_type      = __btree_node_type(path->level, path->btree_id),
+               .btree_id       = path->btree_id,
+               .level          = path->level,
+               .cached         = path->cached,
+               .path           = path,
+               .k              = k,
+               .seq            = seq,
+               .ip_allocated   = ip,
+       };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans_for_each_update(trans, i)
+               BUG_ON(i != trans->updates &&
+                      btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+       /*
+        * Pending updates are kept sorted: first, find position of new update,
+        * then delete/trim any updates the new update overwrites:
+        */
+       trans_for_each_update(trans, i) {
+               cmp = btree_insert_entry_cmp(&n, i);
+               if (cmp <= 0)
+                       break;
+       }
+
+       if (!cmp && i < trans->updates + trans->nr_updates) {
+               EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+               bch2_path_put(trans, i->path, true);
+               i->flags        = n.flags;
+               i->cached       = n.cached;
+               i->k            = n.k;
+               i->path         = n.path;
+               i->seq          = n.seq;
+               i->ip_allocated = n.ip_allocated;
+       } else {
+               array_insert_item(trans->updates, trans->nr_updates,
+                                 i - trans->updates, n);
+
+               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+               if (unlikely(trans->journal_replay_not_finished)) {
+                       struct bkey_i *j_k =
+                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+                       if (j_k) {
+                               i->old_k = j_k->k;
+                               i->old_v = &j_k->v;
+                       }
+               }
+       }
+
+       __btree_path_get(i->path, true);
+
+       /*
+        * If a key is present in the key cache, it must also exist in the
+        * btree - this is necessary for cache coherency. When iterating over
+        * a btree that's cached in the key cache, the btree iter code checks
+        * the key cache - but the key has to exist in the btree for that to
+        * work:
+        */
+       if (path->cached && bkey_deleted(&i->old_k))
+               return flush_new_cached_update(trans, path, i, flags, ip);
+
+       return 0;
+}
+
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+                                  struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_path *path = iter->update_path ?: iter->path;
+       struct bkey_cached *ck;
+       int ret;
+
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return bch2_trans_update_extent(trans, iter, k, flags);
+
+       if (bkey_deleted(&k->k) &&
+           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               if (unlikely(ret < 0))
+                       return ret;
+
+               if (ret)
+                       k->k.type = KEY_TYPE_whiteout;
+       }
+
+       /*
+        * Ensure that updates to cached btrees go to the key cache:
+        */
+       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           !path->cached &&
+           !path->level &&
+           btree_id_cached(trans->c, path->btree_id)) {
+               if (!iter->key_cache_path ||
+                   !iter->key_cache_path->should_be_locked ||
+                   !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
+                       if (!iter->key_cache_path)
+                               iter->key_cache_path =
+                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                                     BTREE_ITER_INTENT|
+                                                     BTREE_ITER_CACHED, _THIS_IP_);
+
+                       iter->key_cache_path =
+                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                                       iter->flags & BTREE_ITER_INTENT,
+                                                       _THIS_IP_);
+
+                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                                      BTREE_ITER_CACHED);
+                       if (unlikely(ret))
+                               return ret;
+
+                       ck = (void *) iter->key_cache_path->l[0].b;
+
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                               trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+                               return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+                       }
+
+                       btree_path_set_should_be_locked(iter->key_cache_path);
+               }
+
+               path = iter->key_cache_path;
+       }
+
+       return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+}
+
+/*
+ * Add a transaction update for a key that has already been journaled.
+ */
+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
+                                      struct btree_iter *iter, struct bkey_i *k,
+                                      enum btree_update_flags flags)
+{
+       trans->journal_res.seq = seq;
+       return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
+                                                BTREE_UPDATE_PREJOURNAL);
+}
+
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+                                           enum btree_id btree,
+                                           struct bkey_i *k)
+{
+       struct btree_write_buffered_key *i;
+       int ret;
+
+       EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+       EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+       trans_for_each_wb_update(trans, i) {
+               if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+                       bkey_copy(&i->k, k);
+                       return 0;
+               }
+       }
+
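+       /*
+        * No existing buffered update at this position: make room for a new
+        * one, doubling the buffer when it's full:
+        */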
+       if (!trans->wb_updates ||
+           trans->nr_wb_updates == trans->wb_updates_size) {
+               struct btree_write_buffered_key *u;
+
+               if (trans->nr_wb_updates == trans->wb_updates_size) {
+                       struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+                       BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+                       trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+                       if (s)
+                               s->wb_updates_size = trans->wb_updates_size;
+               }
+
+               u = bch2_trans_kmalloc_nomemzero(trans,
+                                       trans->wb_updates_size *
+                                       sizeof(struct btree_write_buffered_key));
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       return ret;
+
+               if (trans->nr_wb_updates)
+                       memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+                              sizeof(struct btree_write_buffered_key));
+               trans->wb_updates = u;
+       }
+
+       trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+               .btree  = btree,
+       };
+
+       bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+       trans->nr_wb_updates++;
+
+       return 0;
+}
+
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+                            enum btree_id btree, struct bpos end)
+{
+       struct bkey_s_c k;
+       int ret = 0;
+
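+       /*
+        * Position the iterator on the last key in the btree, advance to the
+        * first empty slot after it, and fail if that slot lies beyond @end:
+        */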
+       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_prev(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_advance(iter);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+       if (bkey_gt(k.k->p, end)) {
+               ret = -BCH_ERR_ENOSPC_btree_slot;
+               goto err;
+       }
+
+       return 0;
+err:
+       bch2_trans_iter_exit(trans, iter);
+       return ret;
+}
+
+void bch2_trans_commit_hook(struct btree_trans *trans,
+                           struct btree_trans_commit_hook *h)
+{
+       h->next = trans->hooks;
+       trans->hooks = h;
+}
+
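+/*
+ * Insert @k at its exact position, bypassing extent overwrite handling:
+ */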
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+                               enum btree_id btree, struct bkey_i *k,
+                               enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, flags);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
+                       struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, flags);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into a given btree
+ * @c:                 pointer to struct bch_fs
+ * @id:                        btree to insert into
+ * @k:                 key to insert
+ * @disk_res:          disk reservation to use, may be NULL
+ * @journal_seq:       if non-NULL, the journal sequence number of the commit
+ *                     is returned here
+ * @flags:             transaction commit flags
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
+                     struct bkey_i *k,
+                     struct disk_reservation *disk_res,
+                     u64 *journal_seq, int flags)
+{
+       return bch2_trans_do(c, disk_res, journal_seq, flags,
+                            __bch2_btree_insert(&trans, id, k, 0));
+}
+
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+                               unsigned len, unsigned update_flags)
+{
+       struct bkey_i *k;
+
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
+
+       bkey_init(&k->k);
+       k->k.p = iter->pos;
+       bch2_key_resize(&k->k, len);
+       return bch2_trans_update(trans, iter, k, update_flags);
+}
+
+int bch2_btree_delete_at(struct btree_trans *trans,
+                        struct btree_iter *iter, unsigned update_flags)
+{
+       return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+                                 enum btree_id btree, struct bpos pos)
+{
+       struct bkey_i *k;
+
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
+
+       bkey_init(&k->k);
+       k->k.p = pos;
+       return bch2_trans_update_buffered(trans, btree, k);
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+                                 struct bpos start, struct bpos end,
+                                 unsigned update_flags,
+                                 u64 *journal_seq)
+{
+       u32 restart_count = trans->restart_count;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+       while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(trans->c, 0);
+               struct bkey_i delete;
+
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               bkey_init(&delete.k);
+
+               /*
+                * This could probably be more efficient for extents:
+                */
+
+               /*
+                * For extents, iter.pos won't necessarily be the same as
+                * bkey_start_pos(k.k) (for non extents they always will be the
+                * same). It's important that we delete starting from iter.pos
+                * because the range we want to delete could start in the middle
+                * of k.
+                *
+                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+                * bkey_start_pos(k.k)).
+                */
+               delete.k.p = iter.pos;
+
+               if (iter.flags & BTREE_ITER_IS_EXTENTS)
+                       bch2_key_resize(&delete.k,
+                                       bpos_min(end, k.k->p).offset -
+                                       iter.pos.offset);
+
+               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+                       bch2_trans_commit(trans, &disk_res, journal_seq,
+                                         BTREE_INSERT_NOFAIL);
+               bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+               /*
+                * the bch2_trans_begin() call is in a weird place because we
+                * need to call it after every transaction commit, to avoid path
+                * overflow, but don't want to call it if the delete operation
+                * is a no-op and we have no work to do:
+                */
+               bch2_trans_begin(trans);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       ret = 0;
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (!ret && trans_was_restarted(trans, restart_count))
+               ret = -BCH_ERR_transaction_restart_nested;
+       return ret;
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+                           struct bpos start, struct bpos end,
+                           unsigned update_flags,
+                           u64 *journal_seq)
+{
+       int ret = bch2_trans_run(c,
+                       bch2_btree_delete_range_trans(&trans, id, start, end,
+                                                     update_flags, journal_seq));
+       if (ret == -BCH_ERR_transaction_restart_nested)
+               ret = 0;
+       return ret;
+}
+
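+/*
+ * Set or clear a boolean-style key (KEY_TYPE_set/KEY_TYPE_deleted) at @pos,
+ * via the btree write buffer:
+ */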
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+                      struct bpos pos, bool set)
+{
+       struct bkey_i *k;
+       int ret = 0;
+
+       k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+       ret = PTR_ERR_OR_ZERO(k);
+       if (unlikely(ret))
+               return ret;
+
+       bkey_init(&k->k);
+       k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       k->k.p = pos;
+
+       return bch2_trans_update_buffered(trans, btree, k);
+}
+
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+       struct printbuf buf = PRINTBUF;
+       struct jset_entry_log *l;
+       unsigned u64s;
+       int ret;
+
+       prt_vprintf(&buf, fmt, args);
+       ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+       if (ret)
+               goto err;
+
+       u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+       ret = darray_make_room(entries, jset_u64s(u64s));
+       if (ret)
+               goto err;
+
+       l = (void *) &darray_top(*entries);
+       l->entry.u64s           = cpu_to_le16(u64s);
+       l->entry.btree_id       = 0;
+       l->entry.level          = 1;
+       l->entry.type           = BCH_JSET_ENTRY_log;
+       l->entry.pad[0]         = 0;
+       l->entry.pad[1]         = 0;
+       l->entry.pad[2]         = 0;
+       memcpy(l->d, buf.buf, buf.pos);
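+       /* Journal entries are u64 aligned; pad the message out with NULs: */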
+       while (buf.pos & 7)
+               l->d[buf.pos++] = '\0';
+
+       entries->nr += jset_u64s(u64s);
+err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+                 va_list args)
+{
+       int ret;
+
+       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+       } else {
+               ret = bch2_trans_do(c, NULL, NULL,
+                       BTREE_INSERT_LAZY_RW|commit_flags,
+                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
+       }
+
+       return ret;
+}
+
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, 0, fmt, args);
+       va_end(args);
+       return ret;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
+       va_end(args);
+       return ret;
+}
index f42ef46c59df6f423b524d4d0056e89af72f041b..986dd541435a0fcb4ef6ccacce44857147b98d93 100644 (file)
@@ -5,6 +5,7 @@
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -17,7 +18,6 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
-#include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 #include "trace.h"
index a418f664896de662c4d5653fde5b1123e036219f..f192809f50cf040fe0129f00c7c8dc011356ef7c 100644 (file)
 
 #include "buckets_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+       return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+       return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+       u32 remainder;
+
+       div_u64_rem(s, ca->mi.bucket_size, &remainder);
+       return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+                                                u32 *offset)
+{
+       return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
 
 #define for_each_bucket(_b, _buckets)                          \
        for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
@@ -292,6 +316,27 @@ int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
                                    size_t, enum bch_data_type, unsigned);
 int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
 
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       u64 b_offset    = bucket_to_sector(ca, b);
+       u64 b_end       = bucket_to_sector(ca, b + 1);
+       unsigned i;
+
+       if (!b)
+               return true;
+
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+               u64 end = offset + (1 << layout->sb_max_size_bits);
+
+               if (!(offset >= b_end || end <= b_offset))
+                       return true;
+       }
+
+       return false;
+}
+
 /* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
index a08997a5bb67566f27abbcb5e7a85b61d8c48b19..a0ef85eb52a4a90e7e2115dd5f1a66d60168cc79 100644 (file)
@@ -360,7 +360,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
 
        state.type = type;
        bch2_checksum_init(&state);
-       state.seed = a.lo;
+       state.seed = (u64 __force) a.lo;
 
        BUG_ON(!bch2_checksum_mergeable(type));
 
@@ -371,7 +371,7 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
                                page_address(ZERO_PAGE(0)), b);
                b_len -= b;
        }
-       a.lo = bch2_checksum_final(&state);
+       a.lo = (__le64 __force) bch2_checksum_final(&state);
        a.lo ^= b.lo;
        a.hi ^= b.hi;
        return a;
@@ -426,7 +426,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
                merged = bch2_checksum_bio(c, crc_old.csum_type,
                                extent_nonce(version, crc_old), bio);
 
-       if (bch2_crc_cmp(merged, crc_old.csum)) {
+       if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
                bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
                        "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
                        crc_old.csum.hi,
@@ -458,6 +458,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
        return 0;
 }
 
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+               prt_printf(err, "wrong size (got %zu should be %zu)",
+                      vstruct_bytes(&crypt->field), sizeof(*crypt));
+               return -BCH_ERR_invalid_sb_crypt;
+       }
+
+       if (BCH_CRYPT_KDF_TYPE(crypt)) {
+               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+               return -BCH_ERR_invalid_sb_crypt;
+       }
+
+       return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       prt_printf(out, "KFD:               %llu", BCH_CRYPT_KDF_TYPE(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
+       prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+       .validate       = bch2_sb_crypt_validate,
+       .to_text        = bch2_sb_crypt_to_text,
+};
+
 #ifdef __KERNEL__
 static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
@@ -597,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
        if (ret)
                goto out;
 
-       crypt->key.magic        = BCH_KEY_MAGIC;
+       crypt->key.magic        = cpu_to_le64(BCH_KEY_MAGIC);
        crypt->key.key          = key;
 
        SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
@@ -625,7 +667,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        if (ret)
                goto err;
 
-       key.magic = BCH_KEY_MAGIC;
+       key.magic = cpu_to_le64(BCH_KEY_MAGIC);
        get_random_bytes(&key.key, sizeof(key.key));
 
        if (keyed) {
index 1ad1d5f03939ce12e3d7469f64db634450ba9f2c..c7b1a8fca6850dd91e1ed4432683246d5dbc2fa8 100644 (file)
@@ -72,6 +72,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
                : 0;
 }
 
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
 int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
                        struct bch_key *);
 
index de14ca3a9895ddc2d756776252fde5dd0480fd03..f36472c4a78187ae6afe61a1f30114d2b32c2b2a 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "disk_groups.h"
+#include "sb-members.h"
 #include "super-io.h"
 
 #include <linux/sort.h>
index 735eb24161139931ca1437d707389fe767b073ac..f7fa87442e98b849d12d140ea159a36c32ab2eb6 100644 (file)
        x(BCH_ERR_invalid_sb,           invalid_sb_quota)                       \
        x(BCH_ERR_invalid,              invalid_bkey)                           \
        x(BCH_ERR_operation_blocked,    nocow_lock_blocked)                     \
+       x(EIO,                          btree_node_read_err)                    \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_fixable)            \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_want_retry)         \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_must_retry)         \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_bad_node)           \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_incompatible)
 
 enum bch_errcode {
        BCH_ERR_START           = 2048,
index c13e0afc66eaa4430bc58b68960c22ea5fd4de5e..7a3f42f3bc5bfa8c61d6b35f6092cb91ca4a7add 100644 (file)
@@ -517,13 +517,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
        switch (type) {
        case BCH_EXTENT_ENTRY_crc32:
                set_common_fields(dst->crc32, src);
-               memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum));
+               dst->crc32.csum         = (u32 __force) *((__le32 *) &src.csum.lo);
                break;
        case BCH_EXTENT_ENTRY_crc64:
                set_common_fields(dst->crc64, src);
                dst->crc64.nonce        = src.nonce;
-               dst->crc64.csum_lo      = src.csum.lo;
-               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
+               dst->crc64.csum_lo      = (u64 __force) src.csum.lo;
+               dst->crc64.csum_hi      = (u64 __force) *((__le16 *) &src.csum.hi);
                break;
        case BCH_EXTENT_ENTRY_crc128:
                set_common_fields(dst->crc128, src);
index 6e9d23a06758685fd90372df237d8d83c8119b4c..7ee8d031bb6c50c809114107dbf79c7aadc17bf7 100644 (file)
@@ -155,7 +155,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
                        common_fields(crc->crc32),
                };
 
-               memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum));
+               *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
                return ret;
        }
        case BCH_EXTENT_ENTRY_crc64: {
@@ -165,8 +165,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
                        .csum.lo                = (__force __le64) crc->crc64.csum_lo,
                };
 
-               u16 hi = crc->crc64.csum_hi;
-               memcpy(&ret.csum.hi, &hi, sizeof(hi));
+               *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
                return ret;
        }
        case BCH_EXTENT_ENTRY_crc128: {
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
new file mode 100644 (file)
index 0000000..f6c8d21
--- /dev/null
@@ -0,0 +1,1102 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return true;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return true;
+       return false;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+       struct folio_iter fi;
+
+       bio_for_each_folio_all(fi, bio) {
+               if (!bio->bi_status) {
+                       folio_mark_uptodate(fi.folio);
+               } else {
+                       folio_clear_uptodate(fi.folio);
+                       folio_set_error(fi.folio);
+               }
+               folio_unlock(fi.folio);
+       }
+
+       bio_put(bio);
+}
+
+struct readpages_iter {
+       struct address_space    *mapping;
+       unsigned                idx;
+       folios                  folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+                              struct readahead_control *ractl)
+{
+       struct folio **fi;
+       int ret;
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->mapping = ractl->mapping;
+
+       ret = bch2_filemap_get_contig_folios_d(iter->mapping,
+                               ractl->_index << PAGE_SHIFT,
+                               (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
+                               0, mapping_gfp_mask(iter->mapping),
+                               &iter->folios);
+       if (ret)
+               return ret;
+
+       darray_for_each(iter->folios, fi) {
+               ractl->_nr_pages -= 1U << folio_order(*fi);
+               __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
+               folio_put(*fi);
+               folio_put(*fi);
+       }
+
+       return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+       if (iter->idx >= iter->folios.nr)
+               return NULL;
+       return iter->folios.data[iter->idx];
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+       iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       bkey_for_each_crc(k.k, ptrs, crc, i)
+               if (crc.csum_type || crc.compression_type)
+                       return true;
+       return false;
+}
+
+static int readpage_bio_extend(struct btree_trans *trans,
+                              struct readpages_iter *iter,
+                              struct bio *bio,
+                              unsigned sectors_this_extent,
+                              bool get_more)
+{
+       /* Don't hold btree locks while allocating memory: */
+       bch2_trans_unlock(trans);
+
+       while (bio_sectors(bio) < sectors_this_extent &&
+              bio->bi_vcnt < bio->bi_max_vecs) {
+               struct folio *folio = readpage_iter_peek(iter);
+               int ret;
+
+               if (folio) {
+                       readpage_iter_advance(iter);
+               } else {
+                       pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+
+                       if (!get_more)
+                               break;
+
+                       folio = xa_load(&iter->mapping->i_pages, folio_offset);
+                       if (folio && !xa_is_value(folio))
+                               break;
+
+                       folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+                       if (!folio)
+                               break;
+
+                       if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+                               folio_put(folio);
+                               break;
+                       }
+
+                       ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+                       if (ret) {
+                               __bch2_folio_release(folio);
+                               folio_put(folio);
+                               break;
+                       }
+
+                       folio_put(folio);
+               }
+
+               BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+               BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+       }
+
+       return bch2_trans_relock(trans);
+}
+
+static void bchfs_read(struct btree_trans *trans,
+                      struct bch_read_bio *rbio,
+                      subvol_inum inum,
+                      struct readpages_iter *readpages_iter)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       int flags = BCH_READ_RETRY_IF_STALE|
+               BCH_READ_MAY_PROMOTE;
+       u32 snapshot;
+       int ret = 0;
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
+
+       bch2_bkey_buf_init(&sk);
+retry:
+       bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
+       while (1) {
+               struct bkey_s_c k;
+               unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
+
+               /*
+                * read_extent -> io_time_reset may cause a transaction restart
+                * without returning an error, we need to check for that here:
+                */
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       break;
+
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               offset_into_extent = iter.pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               bch2_bkey_buf_reassemble(&sk, c, k);
+
+               ret = bch2_read_indirect_extent(trans, &data_btree,
+                                       &offset_into_extent, &sk);
+               if (ret)
+                       break;
+
+               k = bkey_i_to_s_c(sk.k);
+
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               if (readpages_iter) {
+                       ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+                                                 extent_partial_reads_expensive(k));
+                       if (ret)
+                               break;
+               }
+
+               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+
+               if (rbio->bio.bi_iter.bi_size == bytes)
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               bch2_bio_page_state_set(&rbio->bio, k);
+
+               bch2_read_extent(trans, rbio, iter.pos,
+                                data_btree, k, offset_into_extent, flags);
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       break;
+
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+               bio_advance(&rbio->bio, bytes);
+
+               ret = btree_trans_too_many_iters(trans);
+               if (ret)
+                       break;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+
+       if (ret) {
+               bch_err_inum_offset_ratelimited(c,
+                               iter.pos.inode,
+                               iter.pos.offset << 9,
+                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
+               bio_endio(&rbio->bio);
+       }
+
+       bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts;
+       struct btree_trans trans;
+       struct folio *folio;
+       struct readpages_iter readpages_iter;
+       int ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       ret = readpages_iter_init(&readpages_iter, ractl);
+       BUG_ON(ret);
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       bch2_pagecache_add_get(inode);
+
+       while ((folio = readpage_iter_peek(&readpages_iter))) {
+               unsigned n = min_t(unsigned,
+                                  readpages_iter.folios.nr -
+                                  readpages_iter.idx,
+                                  BIO_MAX_VECS);
+               struct bch_read_bio *rbio =
+                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+                                                  GFP_KERNEL, &c->bio_read),
+                                 opts);
+
+               readpage_iter_advance(&readpages_iter);
+
+               rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+               rbio->bio.bi_end_io = bch2_readpages_end_io;
+               BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+               bchfs_read(&trans, rbio, inode_inum(inode),
+                          &readpages_iter);
+               bch2_trans_unlock(&trans);
+       }
+
+       bch2_pagecache_add_put(inode);
+
+       bch2_trans_exit(&trans);
+       darray_exit(&readpages_iter.folios);
+}
+
+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
+                            subvol_inum inum, struct folio *folio)
+{
+       struct btree_trans trans;
+
+       bch2_folio_create(folio, __GFP_NOFAIL);
+
+       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+       rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+       BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+       bch2_trans_init(&trans, c, 0, 0);
+       bchfs_read(&trans, rbio, inum, NULL);
+       bch2_trans_exit(&trans);
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+       complete(bio->bi_private);
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_read_bio *rbio;
+       struct bch_io_opts opts;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+                        opts);
+       rbio->bio.bi_private = &done;
+       rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+
+       __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+       wait_for_completion(&done);
+
+       ret = blk_status_to_errno(rbio->bio.bi_status);
+       bio_put(&rbio->bio);
+
+       if (ret < 0)
+               return ret;
+
+       folio_mark_uptodate(folio);
+       return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+       int ret;
+
+       ret = bch2_read_single_folio(folio, folio->mapping);
+       folio_unlock(folio);
+       return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+       struct bch_inode_info           *inode;
+
+       /* must be last: */
+       struct bch_write_op             op;
+};
+
+struct bch_writepage_state {
+       struct bch_writepage_io *io;
+       struct bch_io_opts      opts;
+       struct bch_folio_sector *tmp;
+       unsigned                tmp_sectors;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+                                                                 struct bch_inode_info *inode)
+{
+       struct bch_writepage_state ret = { 0 };
+
+       bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+       return ret;
+}
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+       struct bch_writepage_io *io =
+               container_of(op, struct bch_writepage_io, op);
+       struct bch_fs *c = io->op.c;
+       struct bio *bio = &io->op.wbio.bio;
+       struct folio_iter fi;
+       unsigned i;
+
+       if (io->op.error) {
+               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+               bio_for_each_folio_all(fi, bio) {
+                       struct bch_folio *s;
+
+                       folio_set_error(fi.folio);
+                       mapping_set_error(fi.folio->mapping, -EIO);
+
+                       s = __bch2_folio(fi.folio);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < folio_sectors(fi.folio); i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
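+       /*
+        * Data was written inline in the btree key, with no extent pointers
+        * backing it, so these sectors have no on-disk replicas:
+        */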
+       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+               bio_for_each_folio_all(fi, bio) {
+                       struct bch_folio *s;
+
+                       s = __bch2_folio(fi.folio);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < folio_sectors(fi.folio); i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
+       /*
+        * racing with fallocate can cause us to add fewer sectors than
+        * expected - but we shouldn't add more sectors than expected:
+        */
+       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+       /*
+        * An error (due to going read-only) partway through a page can leave
+        * i_sectors_delta slightly off, so this assertion stays disabled:
+        *
+        *      BUG_ON(io->op.i_sectors_delta >= PAGE_SECTORS);
+        */
+
+       /*
+        * PageWriteback is effectively our ref on the inode - fixup i_blocks
+        * before calling end_page_writeback:
+        */
+       bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+       bio_for_each_folio_all(fi, bio) {
+               struct bch_folio *s = __bch2_folio(fi.folio);
+
+               if (atomic_dec_and_test(&s->write_count))
+                       folio_end_writeback(fi.folio);
+       }
+
+       bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+       struct bch_writepage_io *io = w->io;
+
+       w->io = NULL;
+       closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Get a bch_writepage_io and add @page to it - appending to an existing one if
+ * possible, else allocating a new one:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+                                   struct writeback_control *wbc,
+                                   struct bch_writepage_state *w,
+                                   struct bch_inode_info *inode,
+                                   u64 sector,
+                                   unsigned nr_replicas)
+{
+       struct bch_write_op *op;
+
+       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+                                             REQ_OP_WRITE,
+                                             GFP_KERNEL,
+                                             &c->writepage_bioset),
+                            struct bch_writepage_io, op.wbio.bio);
+
+       w->io->inode            = inode;
+       op                      = &w->io->op;
+       bch2_write_op_init(op, c, w->opts);
+       op->target              = w->opts.foreground_target;
+       op->nr_replicas         = nr_replicas;
+       op->res.nr_replicas     = nr_replicas;
+       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->subvol              = inode->ei_subvol;
+       op->pos                 = POS(inode->v.i_ino, sector);
+       op->end_io              = bch2_writepage_io_done;
+       op->devs_need_flush     = &inode->ei_devs_need_flush;
+       op->wbio.bio.bi_iter.bi_sector = sector;
+       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
+}
+
+static int __bch2_writepage(struct folio *folio,
+                           struct writeback_control *wbc,
+                           void *data)
+{
+       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_writepage_state *w = data;
+       struct bch_folio *s;
+       unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+       loff_t i_size = i_size_read(&inode->v);
+       int ret;
+
+       EBUG_ON(!folio_test_uptodate(folio));
+
+       /* Is the folio fully inside i_size? */
+       if (folio_end_pos(folio) <= i_size)
+               goto do_io;
+
+       /* Is the folio fully outside i_size? (truncate in progress) */
+       if (folio_pos(folio) >= i_size) {
+               folio_unlock(folio);
+               return 0;
+       }
+
+       /*
+        * The folio straddles i_size.  It must be zeroed out on each and every
+        * writepage invocation because it may be mmapped.  "A file is mapped
+        * in multiples of the folio size.  For a file that is not a multiple of
+        * the folio size, the remaining memory is zeroed when mapped, and
+        * writes to that region are not written out to the file."
+        */
+       folio_zero_segment(folio,
+                          i_size - folio_pos(folio),
+                          folio_size(folio));
+do_io:
+       f_sectors = folio_sectors(folio);
+       s = bch2_folio(folio);
+
+       if (f_sectors > w->tmp_sectors) {
+               kfree(w->tmp);
+               w->tmp = kzalloc(sizeof(struct bch_folio_sector) *
+                                f_sectors, __GFP_NOFAIL);
+               w->tmp_sectors = f_sectors;
+       }
+
+       /*
+        * Things get really hairy with errors during writeback:
+        */
+       ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+       BUG_ON(ret);
+
+       /* Before unlocking the page, get copy of reservations: */
+       spin_lock(&s->lock);
+       memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+       for (i = 0; i < f_sectors; i++) {
+               if (s->s[i].state < SECTOR_dirty)
+                       continue;
+
+               nr_replicas_this_write =
+                       min_t(unsigned, nr_replicas_this_write,
+                             s->s[i].nr_replicas +
+                             s->s[i].replicas_reserved);
+       }
+
+       for (i = 0; i < f_sectors; i++) {
+               if (s->s[i].state < SECTOR_dirty)
+                       continue;
+
+               s->s[i].nr_replicas = w->opts.compression
+                       ? 0 : nr_replicas_this_write;
+
+               s->s[i].replicas_reserved = 0;
+               bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+       }
+       spin_unlock(&s->lock);
+
+       BUG_ON(atomic_read(&s->write_count));
+       atomic_set(&s->write_count, 1);
+
+       BUG_ON(folio_test_writeback(folio));
+       folio_start_writeback(folio);
+
+       folio_unlock(folio);
+
+       offset = 0;
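+       /*
+        * Walk the folio in runs of contiguous dirty sectors, appending each
+        * run to the current write op and starting a new op whenever the run
+        * isn't contiguous with it or doesn't fit:
+        */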
+       while (1) {
+               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+               u64 sector;
+
+               while (offset < f_sectors &&
+                      w->tmp[offset].state < SECTOR_dirty)
+                       offset++;
+
+               if (offset == f_sectors)
+                       break;
+
+               while (offset + sectors < f_sectors &&
+                      w->tmp[offset + sectors].state >= SECTOR_dirty) {
+                       reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+                       dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+                       sectors++;
+               }
+               BUG_ON(!sectors);
+
+               sector = folio_sector(folio) + offset;
+
+               if (w->io &&
+                   (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+                    bio_full(&w->io->op.wbio.bio, sectors << 9) ||
+                    w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
+                    (BIO_MAX_VECS * PAGE_SIZE) ||
+                    bio_end_sector(&w->io->op.wbio.bio) != sector))
+                       bch2_writepage_do_io(w);
+
+               if (!w->io)
+                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+                                               nr_replicas_this_write);
+
+               atomic_inc(&s->write_count);
+
+               BUG_ON(inode != w->io->inode);
+               BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+                                    sectors << 9, offset << 9));
+
+               /* Check for writing past i_size: */
+               WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+                         round_up(i_size, block_bytes(c)) &&
+                         !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+                         "writing past i_size: %llu > %llu (unrounded %llu)\n",
+                         bio_end_sector(&w->io->op.wbio.bio) << 9,
+                         round_up(i_size, block_bytes(c)),
+                         i_size);
+
+               w->io->op.res.sectors += reserved_sectors;
+               w->io->op.i_sectors_delta -= dirty_sectors;
+               w->io->op.new_i_size = i_size;
+
+               offset += sectors;
+       }
+
+       if (atomic_dec_and_test(&s->write_count))
+               folio_end_writeback(folio);
+
+       return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(mapping->host));
+       struct blk_plug plug;
+       int ret;
+
+       blk_start_plug(&plug);
+       ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+       if (w.io)
+               bch2_writepage_do_io(&w);
+       blk_finish_plug(&plug);
+       kfree(w.tmp);
+       return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+                    loff_t pos, unsigned len,
+                    struct page **pagep, void **fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation *res;
+       struct folio *folio;
+       unsigned offset;
+       int ret = -ENOMEM;
+
+       res = kmalloc(sizeof(*res), GFP_KERNEL);
+       if (!res)
+               return -ENOMEM;
+
+       bch2_folio_reservation_init(c, inode, res);
+       *fsdata = res;
+
+       bch2_pagecache_add_get(inode);
+
+       folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+                               FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
+                               mapping_gfp_mask(mapping));
+       if (IS_ERR_OR_NULL(folio))
+               goto err_unlock;
+
+       if (folio_test_uptodate(folio))
+               goto out;
+
+       offset = pos - folio_pos(folio);
+       len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+       /* If we're writing entire folio, don't need to read it in first: */
+       if (!offset && len == folio_size(folio))
+               goto out;
+
+       if (!offset && pos + len >= inode->v.i_size) {
+               folio_zero_segment(folio, len, folio_size(folio));
+               flush_dcache_folio(folio);
+               goto out;
+       }
+
+       if (folio_pos(folio) >= inode->v.i_size) {
+               folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+               flush_dcache_folio(folio);
+               goto out;
+       }
+readpage:
+       ret = bch2_read_single_folio(folio, mapping);
+       if (ret)
+               goto err;
+out:
+       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+       if (ret)
+               goto err;
+
+       ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+       if (ret) {
+               if (!folio_test_uptodate(folio)) {
+                       /*
+                        * If the folio hasn't been read in, we won't know if we
+                        * actually need a reservation - we don't actually need
+                        * to read here, we just need to check if the folio is
+                        * fully backed by uncompressed data:
+                        */
+                       goto readpage;
+               }
+
+               goto err;
+       }
+
+       *pagep = &folio->page;
+       return 0;
+err:
+       folio_unlock(folio);
+       folio_put(folio);
+       *pagep = NULL;
+err_unlock:
+       bch2_pagecache_add_put(inode);
+       kfree(res);
+       *fsdata = NULL;
+       return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+                  loff_t pos, unsigned len, unsigned copied,
+                  struct page *page, void *fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation *res = fsdata;
+       struct folio *folio = page_folio(page);
+       unsigned offset = pos - folio_pos(folio);
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+       BUG_ON(offset + copied > folio_size(folio));
+
+       if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+               /*
+                * The folio needs to be read in, but that would destroy
+                * our partial write - simplest thing is to just force
+                * userspace to redo the write:
+                */
+               folio_zero_range(folio, 0, folio_size(folio));
+               flush_dcache_folio(folio);
+               copied = 0;
+       }
+
+       spin_lock(&inode->v.i_lock);
+       if (pos + copied > inode->v.i_size)
+               i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
+
+       if (copied) {
+               if (!folio_test_uptodate(folio))
+                       folio_mark_uptodate(folio);
+
+               bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+
+               inode->ei_last_dirtied = (unsigned long) current;
+       }
+
+       folio_unlock(folio);
+       folio_put(folio);
+       bch2_pagecache_add_put(inode);
+
+       bch2_folio_reservation_put(c, inode, res);
+       kfree(res);
+
+       return copied;
+}
+
+static noinline void folios_trunc(folios *folios, struct folio **fi)
+{
+       while (folios->data + folios->nr > fi) {
+               struct folio *f = darray_pop(folios);
+
+               folio_unlock(f);
+               folio_put(f);
+       }
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+                                struct address_space *mapping,
+                                struct iov_iter *iter,
+                                loff_t pos, unsigned len)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation res;
+       folios folios;
+       struct folio **fi, *f;
+       unsigned copied = 0, f_offset;
+       u64 end = pos + len, f_pos;
+       loff_t last_folio_pos = inode->v.i_size;
+       int ret = 0;
+
+       BUG_ON(!len);
+
+       bch2_folio_reservation_init(c, inode, &res);
+       darray_init(&folios);
+
+       ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+                                  FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
+                                  mapping_gfp_mask(mapping),
+                                  &folios);
+       if (ret)
+               goto out;
+
+       BUG_ON(!folios.nr);
+
+       f = darray_first(folios);
+       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+               ret = bch2_read_single_folio(f, mapping);
+               if (ret)
+                       goto out;
+       }
+
+       f = darray_last(folios);
+       end = min(end, folio_end_pos(f));
+       last_folio_pos = folio_pos(f);
+       if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+               if (end >= inode->v.i_size) {
+                       folio_zero_range(f, 0, folio_size(f));
+               } else {
+                       ret = bch2_read_single_folio(f, mapping);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+       ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
+       if (ret)
+               goto out;
+
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(folios));
+       darray_for_each(folios, fi) {
+               struct folio *f = *fi;
+               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+
+               /*
+                * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+                * supposed to write as much as we have disk space for.
+                *
+                * On failure here we should still write out a partial page if
+                * we aren't completely out of disk space - we don't do that
+                * yet:
+                */
+               ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
+               if (unlikely(ret)) {
+                       folios_trunc(&folios, fi);
+                       if (!folios.nr)
+                               goto out;
+
+                       end = min(end, folio_end_pos(darray_last(folios)));
+                       break;
+               }
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       if (mapping_writably_mapped(mapping))
+               darray_for_each(folios, fi)
+                       flush_dcache_folio(*fi);
+
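+       /* Now copy the user data into the locked, reserved folios: */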
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(folios));
+       darray_for_each(folios, fi) {
+               struct folio *f = *fi;
+               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+               unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+
+               if (!f_copied) {
+                       folios_trunc(&folios, fi);
+                       break;
+               }
+
+               if (!folio_test_uptodate(f) &&
+                   f_copied != folio_size(f) &&
+                   pos + copied + f_copied < inode->v.i_size) {
+                       folio_zero_range(f, 0, folio_size(f));
+                       folios_trunc(&folios, fi);
+                       break;
+               }
+
+               flush_dcache_folio(f);
+               copied += f_copied;
+
+               if (f_copied != f_len) {
+                       folios_trunc(&folios, fi + 1);
+                       break;
+               }
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       if (!copied)
+               goto out;
+
+       end = pos + copied;
+
+       spin_lock(&inode->v.i_lock);
+       if (end > inode->v.i_size)
+               i_size_write(&inode->v, end);
+       spin_unlock(&inode->v.i_lock);
+
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(folios));
+       darray_for_each(folios, fi) {
+               struct folio *f = *fi;
+               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
+
+               if (!folio_test_uptodate(f))
+                       folio_mark_uptodate(f);
+
+               bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       inode->ei_last_dirtied = (unsigned long) current;
+out:
+       darray_for_each(folios, fi) {
+               folio_unlock(*fi);
+               folio_put(*fi);
+       }
+
+       /*
+        * If the last folio added to the mapping starts beyond current EOF, we
+        * performed a short write but left around at least one post-EOF folio.
+        * Clean up the mapping before we return.
+        */
+       if (last_folio_pos >= inode->v.i_size)
+               truncate_pagecache(&inode->v, inode->v.i_size);
+
+       darray_exit(&folios);
+       bch2_folio_reservation_put(c, inode, &res);
+
+       return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       loff_t pos = iocb->ki_pos;
+       ssize_t written = 0;
+       int ret = 0;
+
+       bch2_pagecache_add_get(inode);
+
+       do {
+               unsigned offset = pos & (PAGE_SIZE - 1);
+               unsigned bytes = iov_iter_count(iter);
+again:
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                *
+                * Not only is this an optimisation: it is also required in
+                * order to check that the address is actually valid, since
+                * the usercopies below are atomic and cannot take a fault.
+                */
+               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                       bytes = min_t(unsigned long, iov_iter_count(iter),
+                                     PAGE_SIZE - offset);
+
+                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                               ret = -EFAULT;
+                               break;
+                       }
+               }
+
+               if (unlikely(fatal_signal_pending(current))) {
+                       ret = -EINTR;
+                       break;
+               }
+
+               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+               if (unlikely(ret < 0))
+                       break;
+
+               cond_resched();
+
+               if (unlikely(ret == 0)) {
+                       /*
+                        * If we were unable to copy any data at all, we must
+                        * fall back to a single segment length write.
+                        * If we didn't fall back here, we could livelock
+                        * If we didn't fallback here, we could livelock
+                        * because not all segments in the iov can be copied at
+                        * once without a pagefault.
+                        */
+                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                     iov_iter_single_seg_count(iter));
+                       goto again;
+               }
+               pos += ret;
+               written += ret;
+               ret = 0;
+
+               balance_dirty_pages_ratelimited(mapping);
+       } while (iov_iter_count(iter));
+
+       bch2_pagecache_add_put(inode);
+
+       return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       ssize_t ret;
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               ret = bch2_direct_write(iocb, from);
+               goto out;
+       }
+
+       /* We can write back this queue in page reclaim */
+       current->backing_dev_info = inode_to_bdi(&inode->v);
+       inode_lock(&inode->v);
+
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0)
+               goto unlock;
+
+       ret = file_remove_privs(file);
+       if (ret)
+               goto unlock;
+
+       ret = file_update_time(file);
+       if (ret)
+               goto unlock;
+
+       ret = bch2_buffered_write(iocb, from);
+       if (likely(ret > 0))
+               iocb->ki_pos += ret;
+unlock:
+       inode_unlock(&inode->v);
+       current->backing_dev_info = NULL;
+
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
+out:
+       return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+       bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->writepage_bioset,
+                       4, offsetof(struct bch_writepage_io, op.wbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
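
The buffered write path above reports partial progress whenever it can: __bch2_buffered_write() ends with "return copied ?: ret;", so an error is only surfaced once nothing at all was copied, and bch2_buffered_write() retries a zero-progress atomic copy with a single-segment length to avoid livelocking on a partially faultable iov. As a rough illustration of the first convention, here is a small standalone userspace sketch (write_result() and the sample values are invented for illustration and are not part of the patch):

    #include <stdio.h>

    /* Hypothetical stand-in for the result of one __bch2_buffered_write() call */
    static long write_result(long copied, int err)
    {
            /* Report bytes written if any progress was made; a persistent
             * error is returned by the next call, when copied == 0. */
            return copied ? copied : err;
    }

    int main(void)
    {
            printf("%ld\n", write_result(4096, -28)); /* partial write: 4096 */
            printf("%ld\n", write_result(0, -28));    /* no progress: -28 (-ENOSPC) */
            return 0;
    }
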
diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h
new file mode 100644 (file)
index 0000000..74eba21
--- /dev/null
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+                    unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+                  unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
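
Like the other headers added by this commit, fs-io-buffered.h compiles to no-op inline stubs when NO_BCACHEFS_FS is defined, so callers can invoke bch2_fs_fs_io_buffered_init()/_exit() unconditionally. A minimal standalone sketch of the same pattern, with invented names (build with or without -DNO_FEATURE):

    #include <stdio.h>

    #ifndef NO_FEATURE
    static int feature_init(void) { puts("feature enabled"); return 0; }
    #else
    /* stub: keeps callers free of #ifdefs when the feature is compiled out */
    static inline int feature_init(void) { return 0; }
    #endif

    int main(void)
    {
            return feature_init();
    }
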
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
new file mode 100644 (file)
index 0000000..a5cadc5
--- /dev/null
@@ -0,0 +1,678 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+       struct closure                  cl;
+       struct kiocb                    *req;
+       long                            ret;
+       bool                            should_dirty;
+       struct bch_read_bio             rbio;
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+       if (check_dirty) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
+static void bch2_dio_read_complete(struct closure *cl)
+{
+       struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+       dio->req->ki_complete(dio->req, dio->ret);
+       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+       struct dio_read *dio = bio->bi_private;
+
+       if (bio->bi_status)
+               dio->ret = blk_status_to_errno(bio->bi_status);
+
+       closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+       struct dio_read *dio = bio->bi_private;
+       bool should_dirty = dio->should_dirty;
+
+       bch2_direct_IO_read_endio(bio);
+       bio_check_or_release(bio, should_dirty);
+}
+
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts;
+       struct dio_read *dio;
+       struct bio *bio;
+       loff_t offset = req->ki_pos;
+       bool sync = is_sync_kiocb(req);
+       size_t shorten;
+       ssize_t ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       if ((offset|iter->count) & (block_bytes(c) - 1))
+               return -EINVAL;
+
+       ret = min_t(loff_t, iter->count,
+                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+       if (!ret)
+               return ret;
+
+       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+       iter->count -= shorten;
+
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_READ,
+                              GFP_KERNEL,
+                              &c->dio_read_bioset);
+
+       bio->bi_end_io = bch2_direct_IO_read_endio;
+
+       dio = container_of(bio, struct dio_read, rbio.bio);
+       closure_init(&dio->cl, NULL);
+
+       /*
+        * this is a _really_ horrible hack just to avoid an atomic sub at the
+        * end:
+        */
+       if (!sync) {
+               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER -
+                          CLOSURE_RUNNING +
+                          CLOSURE_DESTRUCTOR);
+       } else {
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER + 1);
+       }
+
+       dio->req        = req;
+       dio->ret        = ret;
+       /*
+        * This is one of the sketchier things I've encountered: we have to skip
+        * the dirtying of requests that originate from within the kernel
+        * (i.e. from loopback), because we'll deadlock on page_lock.
+        */
+       dio->should_dirty = iter_is_iovec(iter);
+
+       goto start;
+       while (iter->count) {
+               bio = bio_alloc_bioset(NULL,
+                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                                      REQ_OP_READ,
+                                      GFP_KERNEL,
+                                      &c->bio_read);
+               bio->bi_end_io          = bch2_direct_IO_read_split_endio;
+start:
+               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
+               bio->bi_iter.bi_sector  = offset >> 9;
+               bio->bi_private         = dio;
+
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (ret < 0) {
+                       /* XXX: fault inject this path */
+                       bio->bi_status = BLK_STS_RESOURCE;
+                       bio_endio(bio);
+                       break;
+               }
+
+               offset += bio->bi_iter.bi_size;
+
+               if (dio->should_dirty)
+                       bio_set_pages_dirty(bio);
+
+               if (iter->count)
+                       closure_get(&dio->cl);
+
+               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+       }
+
+       iter->count += shorten;
+
+       if (sync) {
+               closure_sync(&dio->cl);
+               closure_debug_destroy(&dio->cl);
+               ret = dio->ret;
+               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+               return ret;
+       } else {
+               return -EIOCBQUEUED;
+       }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       size_t count = iov_iter_count(iter);
+       ssize_t ret;
+
+       if (!count)
+               return 0; /* skip atime */
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               struct blk_plug plug;
+
+               if (unlikely(mapping->nrpages)) {
+                       ret = filemap_write_and_wait_range(mapping,
+                                               iocb->ki_pos,
+                                               iocb->ki_pos + count - 1);
+                       if (ret < 0)
+                               goto out;
+               }
+
+               file_accessed(file);
+
+               blk_start_plug(&plug);
+               ret = bch2_direct_IO_read(iocb, iter);
+               blk_finish_plug(&plug);
+
+               if (ret >= 0)
+                       iocb->ki_pos += ret;
+       } else {
+               bch2_pagecache_add_get(inode);
+               ret = generic_file_read_iter(iocb, iter);
+               bch2_pagecache_add_put(inode);
+       }
+out:
+       return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+       struct kiocb                    *req;
+       struct address_space            *mapping;
+       struct bch_inode_info           *inode;
+       struct mm_struct                *mm;
+       unsigned                        loop:1,
+                                       extending:1,
+                                       sync:1,
+                                       flush:1,
+                                       free_iov:1;
+       struct quota_res                quota_res;
+       u64                             written;
+
+       struct iov_iter                 iter;
+       struct iovec                    inline_vecs[2];
+
+       /* must be last: */
+       struct bch_write_op             op;
+};
+
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+                                      u64 offset, u64 size,
+                                      unsigned nr_replicas, bool compressed)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 end = offset + size;
+       u32 snapshot;
+       bool ret = true;
+       int err;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (err)
+               goto err;
+
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, err) {
+               if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+                       break;
+
+               if (k.k->p.snapshot != snapshot ||
+                   nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
+                       ret = false;
+                       break;
+               }
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+               goto retry;
+       bch2_trans_exit(&trans);
+
+       return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct bch_inode_info *inode = dio->inode;
+       struct bio *bio = &dio->op.wbio.bio;
+
+       return bch2_check_range_allocated(c, inode_inum(inode),
+                               dio->op.pos.offset, bio_sectors(bio),
+                               dio->op.opts.data_replicas,
+                               dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, and we're not guaranteed that it will live for the
+ * duration of the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+       struct iovec *iov = dio->inline_vecs;
+
+       /*
+        * iov_iter has a single embedded iovec - nothing to do:
+        */
+       if (iter_is_ubuf(&dio->iter))
+               return 0;
+
+       /*
+        * We don't currently handle non-iovec iov_iters here - return an error,
+        * and we'll fall back to doing the IO synchronously:
+        */
+       if (!iter_is_iovec(&dio->iter))
+               return -1;
+
+       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+                                   GFP_KERNEL);
+               if (unlikely(!iov))
+                       return -ENOMEM;
+
+               dio->free_iov = true;
+       }
+
+       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+       dio->iter.__iov = iov;
+       return 0;
+}
+
+static void bch2_dio_write_flush_done(struct closure *cl)
+{
+       struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+       struct bch_fs *c = dio->op.c;
+
+       closure_debug_destroy(cl);
+
+       dio->op.error = bch2_journal_error(&c->journal);
+
+       bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct bch_inode_unpacked inode;
+       int ret;
+
+       dio->flush = 0;
+
+       closure_init(&dio->op.cl, NULL);
+
+       if (!dio->op.error) {
+               ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+               if (ret) {
+                       dio->op.error = ret;
+               } else {
+                       bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
+                       bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+               }
+       }
+
+       if (dio->sync) {
+               closure_sync(&dio->op.cl);
+               closure_debug_destroy(&dio->op.cl);
+       } else {
+               continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+       }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+       struct kiocb *req = dio->req;
+       struct bch_inode_info *inode = dio->inode;
+       bool sync = dio->sync;
+       long ret;
+
+       if (unlikely(dio->flush)) {
+               bch2_dio_write_flush(dio);
+               if (!sync)
+                       return -EIOCBQUEUED;
+       }
+
+       bch2_pagecache_block_put(inode);
+
+       if (dio->free_iov)
+               kfree(dio->iter.__iov);
+
+       ret = dio->op.error ?: ((long) dio->written << 9);
+       bio_put(&dio->op.wbio.bio);
+
+       /* inode->i_dio_count is our ref on inode and thus bch_fs */
+       inode_dio_end(&inode->v);
+
+       if (ret < 0)
+               ret = bch2_err_class(ret);
+
+       if (!sync) {
+               req->ki_complete(req, ret);
+               ret = -EIOCBQUEUED;
+       }
+       return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct kiocb *req = dio->req;
+       struct bch_inode_info *inode = dio->inode;
+       struct bio *bio = &dio->op.wbio.bio;
+
+       req->ki_pos     += (u64) dio->op.written << 9;
+       dio->written    += dio->op.written;
+
+       if (dio->extending) {
+               spin_lock(&inode->v.i_lock);
+               if (req->ki_pos > inode->v.i_size)
+                       i_size_write(&inode->v, req->ki_pos);
+               spin_unlock(&inode->v.i_lock);
+       }
+
+       if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+               __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+
+       bio_release_pages(bio, false);
+
+       if (unlikely(dio->op.error))
+               set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct kiocb *req = dio->req;
+       struct address_space *mapping = dio->mapping;
+       struct bch_inode_info *inode = dio->inode;
+       struct bch_io_opts opts;
+       struct bio *bio = &dio->op.wbio.bio;
+       unsigned unaligned, iter_count;
+       bool sync = dio->sync, dropped_locks;
+       long ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       while (1) {
+               iter_count = dio->iter.count;
+
+               EBUG_ON(current->faults_disabled_mapping);
+               current->faults_disabled_mapping = mapping;
+
+               ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+               dropped_locks = fdm_dropped_locks();
+
+               current->faults_disabled_mapping = NULL;
+
+               /*
+                * If the fault handler returned an error but also signalled
+                * that it dropped & retook ei_pagecache_lock, we just need to
+                * re-shoot down the page cache and retry:
+                */
+               if (dropped_locks && ret)
+                       ret = 0;
+
+               if (unlikely(ret < 0))
+                       goto err;
+
+               if (unlikely(dropped_locks)) {
+                       ret = bch2_write_invalidate_inode_pages_range(mapping,
+                                       req->ki_pos,
+                                       req->ki_pos + iter_count - 1);
+                       if (unlikely(ret))
+                               goto err;
+
+                       if (!bio->bi_iter.bi_size)
+                               continue;
+               }
+
+               unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+               bio->bi_iter.bi_size -= unaligned;
+               iov_iter_revert(&dio->iter, unaligned);
+
+               if (!bio->bi_iter.bi_size) {
+                       /*
+                        * bio_iov_iter_get_pages was only able to get <
+                        * blocksize worth of pages:
+                        */
+                       ret = -EFAULT;
+                       goto err;
+               }
+
+               bch2_write_op_init(&dio->op, c, opts);
+               dio->op.end_io          = sync
+                       ? NULL
+                       : bch2_dio_write_loop_async;
+               dio->op.target          = dio->op.opts.foreground_target;
+               dio->op.write_point     = writepoint_hashed((unsigned long) current);
+               dio->op.nr_replicas     = dio->op.opts.data_replicas;
+               dio->op.subvol          = inode->ei_subvol;
+               dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+               dio->op.devs_need_flush = &inode->ei_devs_need_flush;
+
+               if (sync)
+                       dio->op.flags |= BCH_WRITE_SYNC;
+               dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+               ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+                                                bio_sectors(bio), true);
+               if (unlikely(ret))
+                       goto err;
+
+               ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+                                               dio->op.opts.data_replicas, 0);
+               if (unlikely(ret) &&
+                   !bch2_dio_write_check_allocated(dio))
+                       goto err;
+
+               task_io_account_write(bio->bi_iter.bi_size);
+
+               if (unlikely(dio->iter.count) &&
+                   !dio->sync &&
+                   !dio->loop &&
+                   bch2_dio_write_copy_iov(dio))
+                       dio->sync = sync = true;
+
+               dio->loop = true;
+               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+               if (!sync)
+                       return -EIOCBQUEUED;
+
+               bch2_dio_write_end(dio);
+
+               if (likely(!dio->iter.count) || dio->op.error)
+                       break;
+
+               bio_reset(bio, NULL, REQ_OP_WRITE);
+       }
+out:
+       return bch2_dio_write_done(dio);
+err:
+       dio->op.error = ret;
+
+       bio_release_pages(bio, false);
+
+       bch2_quota_reservation_put(c, inode, &dio->quota_res);
+       goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+       struct mm_struct *mm = dio->mm;
+
+       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+       if (mm)
+               kthread_use_mm(mm);
+       bch2_dio_write_loop(dio);
+       if (mm)
+               kthread_unuse_mm(mm);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+       struct dio_write *dio = container_of(op, struct dio_write, op);
+
+       bch2_dio_write_end(dio);
+
+       if (likely(!dio->iter.count) || dio->op.error)
+               bch2_dio_write_done(dio);
+       else
+               bch2_dio_write_continue(dio);
+}
+
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct dio_write *dio;
+       struct bio *bio;
+       bool locked = true, extending;
+       ssize_t ret;
+
+       prefetch(&c->opts);
+       prefetch((void *) &c->opts + 64);
+       prefetch(&inode->ei_inode);
+       prefetch((void *) &inode->ei_inode + 64);
+
+       inode_lock(&inode->v);
+
+       ret = generic_write_checks(req, iter);
+       if (unlikely(ret <= 0))
+               goto err;
+
+       ret = file_remove_privs(file);
+       if (unlikely(ret))
+               goto err;
+
+       ret = file_update_time(file);
+       if (unlikely(ret))
+               goto err;
+
+       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       inode_dio_begin(&inode->v);
+       bch2_pagecache_block_get(inode);
+
+       extending = req->ki_pos + iter->count > inode->v.i_size;
+       if (!extending) {
+               inode_unlock(&inode->v);
+               locked = false;
+       }
+
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_WRITE,
+                              GFP_KERNEL,
+                              &c->dio_write_bioset);
+       dio = container_of(bio, struct dio_write, op.wbio.bio);
+       dio->req                = req;
+       dio->mapping            = mapping;
+       dio->inode              = inode;
+       dio->mm                 = current->mm;
+       dio->loop               = false;
+       dio->extending          = extending;
+       dio->sync               = is_sync_kiocb(req) || extending;
+       dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+       dio->free_iov           = false;
+       dio->quota_res.sectors  = 0;
+       dio->written            = 0;
+       dio->iter               = *iter;
+       dio->op.c               = c;
+
+       if (unlikely(mapping->nrpages)) {
+               ret = bch2_write_invalidate_inode_pages_range(mapping,
+                                               req->ki_pos,
+                                               req->ki_pos + iter->count - 1);
+               if (unlikely(ret))
+                       goto err_put_bio;
+       }
+
+       ret = bch2_dio_write_loop(dio);
+err:
+       if (locked)
+               inode_unlock(&inode->v);
+       return ret;
+err_put_bio:
+       bch2_pagecache_block_put(inode);
+       bio_put(bio);
+       inode_dio_end(&inode->v);
+       goto err;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{
+       bioset_exit(&c->dio_write_bioset);
+       bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->dio_read_bioset,
+                       4, offsetof(struct dio_read, rbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+       if (bioset_init(&c->dio_write_bioset,
+                       4, offsetof(struct dio_write, op.wbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
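
bch2_direct_IO_read() and bch2_direct_write() above both bail out early when the file offset or byte count is not a multiple of the filesystem block size, using a single mask test that works because block_bytes() is a power of two. A minimal userspace sketch of that check (dio_aligned() and the sample sizes are illustrative only, not part of the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* For a power-of-two block size, OR-ing offset and count and masking with
     * (block_bytes - 1) detects misalignment of either value in one test. */
    static bool dio_aligned(unsigned long long offset, unsigned long long count,
                            unsigned block_bytes)
    {
            return ((offset | count) & (block_bytes - 1)) == 0;
    }

    int main(void)
    {
            printf("%d\n", dio_aligned(4096, 8192, 4096)); /* 1: both aligned */
            printf("%d\n", dio_aligned(4096, 1000, 4096)); /* 0: length misaligned */
            return 0;
    }
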
diff --git a/libbcachefs/fs-io-direct.h b/libbcachefs/fs-io-direct.h
new file mode 100644 (file)
index 0000000..8e950cc
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
new file mode 100644 (file)
index 0000000..07c4bfe
--- /dev/null
@@ -0,0 +1,777 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+                                    loff_t start, u64 end,
+                                    int fgp_flags, gfp_t gfp,
+                                    folios *folios)
+{
+       struct folio *f;
+       u64 pos = start;
+       int ret = 0;
+
+       while (pos < end) {
+               if ((u64) pos >= (u64) start + (1ULL << 20))
+                       fgp_flags &= ~FGP_CREAT;
+
+               ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
+               if (ret)
+                       break;
+
+               f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+               if (IS_ERR_OR_NULL(f))
+                       break;
+
+               BUG_ON(folios->nr && folio_pos(f) != pos);
+
+               pos = folio_end_pos(f);
+               darray_push(folios, f);
+       }
+
+       if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
+               ret = -ENOMEM;
+
+       return folios->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+                                           loff_t start, loff_t end)
+{
+       int ret;
+
+       /*
+        * XXX: the way this is currently implemented, we can spin if a process
+        * is continually redirtying a specific page
+        */
+       do {
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = filemap_write_and_wait_range(mapping, start, end);
+               if (ret)
+                       break;
+
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = invalidate_inode_pages2_range(mapping,
+                               start >> PAGE_SHIFT,
+                               end >> PAGE_SHIFT);
+       } while (ret == -EBUSY);
+
+       return ret;
+}
+
+static const char * const bch2_folio_sector_states[] = {
+#define x(n)   #n,
+       BCH_FOLIO_SECTOR_STATE()
+#undef x
+       NULL
+};
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_unallocated:
+               return SECTOR_dirty;
+       case SECTOR_reserved:
+               return SECTOR_dirty_reserved;
+       default:
+               return state;
+       }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_dirty:
+               return SECTOR_unallocated;
+       case SECTOR_dirty_reserved:
+               return SECTOR_reserved;
+       default:
+               return state;
+       }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_unallocated:
+               return SECTOR_reserved;
+       case SECTOR_dirty:
+               return SECTOR_dirty_reserved;
+       default:
+               return state;
+       }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+       struct bch_folio *s;
+
+       s = kzalloc(sizeof(*s) +
+                   sizeof(struct bch_folio_sector) *
+                   folio_sectors(folio), gfp);
+       if (!s)
+               return NULL;
+
+       spin_lock_init(&s->lock);
+       folio_attach_private(folio, s);
+       return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+       return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+       if (bkey_extent_is_reservation(k))
+               return SECTOR_reserved;
+       if (bkey_extent_is_allocation(k.k))
+               return SECTOR_allocated;
+       return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+                            unsigned pg_offset, unsigned pg_len,
+                            unsigned nr_ptrs, unsigned state)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, sectors = folio_sectors(folio);
+
+       BUG_ON(pg_offset >= sectors);
+       BUG_ON(pg_offset + pg_len > sectors);
+
+       spin_lock(&s->lock);
+
+       for (i = pg_offset; i < pg_offset + pg_len; i++) {
+               s->s[i].nr_replicas     = nr_ptrs;
+               bch2_folio_sector_set(folio, s, i, state);
+       }
+
+       if (i == sectors)
+               s->uptodate = true;
+
+       spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+                  struct folio **folios, unsigned nr_folios)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_folio *s;
+       u64 offset = folio_sector(folios[0]);
+       unsigned folio_idx;
+       u32 snapshot;
+       bool need_set = false;
+       int ret;
+
+       for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+               s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
+               if (!s)
+                       return -ENOMEM;
+
+               need_set |= !s->uptodate;
+       }
+
+       if (!need_set)
+               return 0;
+
+       folio_idx = 0;
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, ret) {
+               unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+               unsigned state = bkey_to_sector_state(k);
+
+               while (folio_idx < nr_folios) {
+                       struct folio *folio = folios[folio_idx];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end   = folio_end_sector(folio);
+                       unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
+                       unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
+
+                       BUG_ON(k.k->p.offset < folio_start);
+                       BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+                       if (!bch2_folio(folio)->uptodate)
+                               __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+                       if (k.k->p.offset < folio_end)
+                               break;
+                       folio_idx++;
+               }
+
+               if (folio_idx == nr_folios)
+                       break;
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+       struct bvec_iter iter;
+       struct folio_vec fv;
+       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+       unsigned state = bkey_to_sector_state(k);
+
+       bio_for_each_folio(fv, bio, iter)
+               __bch2_folio_set(fv.fv_folio,
+                                fv.fv_offset >> 9,
+                                fv.fv_len >> 9,
+                                nr_ptrs, state);
+}
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+                                    u64 start, u64 end)
+{
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct folio_batch fbatch;
+       unsigned i, j;
+
+       if (end <= start)
+               return;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end = folio_end_sector(folio);
+                       unsigned folio_offset = max(start, folio_start) - folio_start;
+                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                       struct bch_folio *s;
+
+                       BUG_ON(end <= folio_start);
+
+                       folio_lock(folio);
+                       s = bch2_folio(folio);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = folio_offset; j < folio_offset + folio_len; j++)
+                                       s->s[j].nr_replicas = 0;
+                               spin_unlock(&s->lock);
+                       }
+
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+}
+
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+                                 u64 start, u64 end)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct folio_batch fbatch;
+       s64 i_sectors_delta = 0;
+       unsigned i, j;
+
+       if (end <= start)
+               return;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end = folio_end_sector(folio);
+                       unsigned folio_offset = max(start, folio_start) - folio_start;
+                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                       struct bch_folio *s;
+
+                       BUG_ON(end <= folio_start);
+
+                       folio_lock(folio);
+                       s = bch2_folio(folio);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = folio_offset; j < folio_offset + folio_len; j++) {
+                                       i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+                                       bch2_folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
+                               }
+                               spin_unlock(&s->lock);
+                       }
+
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+                                         unsigned nr_replicas)
+{
+       return max(0, (int) nr_replicas -
+                  s->nr_replicas -
+                  s->replicas_reserved);
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+                               struct bch_inode_info *inode,
+                               struct folio *folio, bool check_enospc)
+{
+       struct bch_folio *s = bch2_folio_create(folio, 0);
+       unsigned nr_replicas = inode_nr_replicas(c, inode);
+       struct disk_reservation disk_res = { 0 };
+       unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+       int ret;
+
+       if (!s)
+               return -ENOMEM;
+
+       for (i = 0; i < sectors; i++)
+               disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+       if (!disk_res_sectors)
+               return 0;
+
+       ret = bch2_disk_reservation_get(c, &disk_res,
+                                       disk_res_sectors, 1,
+                                       !check_enospc
+                                       ? BCH_DISK_RESERVATION_NOFAIL
+                                       : 0);
+       if (unlikely(ret))
+               return ret;
+
+       for (i = 0; i < sectors; i++)
+               s->s[i].replicas_reserved +=
+                       sectors_to_reserve(&s->s[i], nr_replicas);
+
+       return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct bch2_folio_reservation *res)
+{
+       bch2_disk_reservation_put(c, &res->disk);
+       bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct folio *folio,
+                       struct bch2_folio_reservation *res,
+                       unsigned offset, unsigned len)
+{
+       struct bch_folio *s = bch2_folio_create(folio, 0);
+       unsigned i, disk_sectors = 0, quota_sectors = 0;
+       int ret;
+
+       if (!s)
+               return -ENOMEM;
+
+       BUG_ON(!s->uptodate);
+
+       for (i = round_down(offset, block_bytes(c)) >> 9;
+            i < round_up(offset + len, block_bytes(c)) >> 9;
+            i++) {
+               disk_sectors += sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
+               quota_sectors += s->s[i].state == SECTOR_unallocated;
+       }
+
+       if (disk_sectors) {
+               ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       if (quota_sectors) {
+               ret = bch2_quota_reservation_add(c, inode, &res->quota,
+                                                quota_sectors, true);
+               if (unlikely(ret)) {
+                       struct disk_reservation tmp = {
+                               .sectors = disk_sectors
+                       };
+
+                       bch2_disk_reservation_put(c, &tmp);
+                       res->disk.sectors -= disk_sectors;
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_folio *s = bch2_folio(folio);
+       struct disk_reservation disk_res = { 0 };
+       int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+       if (!s)
+               return;
+
+       EBUG_ON(!folio_test_locked(folio));
+       EBUG_ON(folio_test_writeback(folio));
+
+       for (i = 0; i < sectors; i++) {
+               disk_res.sectors += s->s[i].replicas_reserved;
+               s->s[i].replicas_reserved = 0;
+
+               dirty_sectors -= s->s[i].state == SECTOR_dirty;
+               bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+       }
+
+       bch2_disk_reservation_put(c, &disk_res);
+
+       bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+       bch2_folio_release(folio);
+}
+
+void bch2_set_folio_dirty(struct bch_fs *c,
+                         struct bch_inode_info *inode,
+                         struct folio *folio,
+                         struct bch2_folio_reservation *res,
+                         unsigned offset, unsigned len)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, dirty_sectors = 0;
+
+       WARN_ON((u64) folio_pos(folio) + offset + len >
+               round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+       BUG_ON(!s->uptodate);
+
+       spin_lock(&s->lock);
+
+       for (i = round_down(offset, block_bytes(c)) >> 9;
+            i < round_up(offset + len, block_bytes(c)) >> 9;
+            i++) {
+               unsigned sectors = sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
+
+               /*
+                * This can happen if we race with the error path in
+                * bch2_writepage_io_done():
+                */
+               sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+               s->s[i].replicas_reserved += sectors;
+               res->disk.sectors -= sectors;
+
+               dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+               bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+       }
+
+       spin_unlock(&s->lock);
+
+       bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+       if (!folio_test_dirty(folio))
+               filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+       struct file *file = vmf->vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct address_space *fdm = faults_disabled_mapping();
+       struct bch_inode_info *inode = file_bch_inode(file);
+       vm_fault_t ret;
+
+       if (fdm == mapping)
+               return VM_FAULT_SIGBUS;
+
+       /* Lock ordering: */
+       if (fdm > mapping) {
+               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+               if (bch2_pagecache_add_tryget(inode))
+                       goto got_lock;
+
+               bch2_pagecache_block_put(fdm_host);
+
+               bch2_pagecache_add_get(inode);
+               bch2_pagecache_add_put(inode);
+
+               bch2_pagecache_block_get(fdm_host);
+
+               /* Signal that lock has been dropped: */
+               set_fdm_dropped_locks();
+               return VM_FAULT_SIGBUS;
+       }
+
+       bch2_pagecache_add_get(inode);
+got_lock:
+       ret = filemap_fault(vmf);
+       bch2_pagecache_add_put(inode);
+
+       return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+       struct folio *folio = page_folio(vmf->page);
+       struct file *file = vmf->vma->vm_file;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation res;
+       unsigned len;
+       loff_t isize;
+       vm_fault_t ret;
+
+       bch2_folio_reservation_init(c, inode, &res);
+
+       sb_start_pagefault(inode->v.i_sb);
+       file_update_time(file);
+
+       /*
+        * Not strictly necessary, but helps avoid dio writes livelocking in
+        * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
+        * a bch2_write_invalidate_inode_pages_range() that works without
+        * dropping the page lock before invalidating the page
+        */
+       bch2_pagecache_add_get(inode);
+
+       folio_lock(folio);
+       isize = i_size_read(&inode->v);
+
+       if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+               folio_unlock(folio);
+               ret = VM_FAULT_NOPAGE;
+               goto out;
+       }
+
+       len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+       if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+           bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+               folio_unlock(folio);
+               ret = VM_FAULT_SIGBUS;
+               goto out;
+       }
+
+       bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+       bch2_folio_reservation_put(c, inode, &res);
+
+       folio_wait_stable(folio);
+       ret = VM_FAULT_LOCKED;
+out:
+       bch2_pagecache_add_put(inode);
+       sb_end_pagefault(inode->v.i_sb);
+
+       return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+       if (offset || length < folio_size(folio))
+               return;
+
+       bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+       if (folio_test_dirty(folio) || folio_test_writeback(folio))
+               return false;
+
+       bch2_clear_folio_bits(folio);
+       return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+                            unsigned min_replicas)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, sectors = folio_sectors(folio);
+
+       if (s)
+               for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+                       if (s->s[i].state >= SECTOR_dirty &&
+                           s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+                               return i << SECTOR_SHIFT;
+
+       return -1;
+}
+
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+                               loff_t start_offset,
+                               loff_t end_offset,
+                               unsigned min_replicas,
+                               bool nonblock)
+{
+       struct folio_batch fbatch;
+       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
+       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
+       pgoff_t index           = start_index;
+       unsigned i;
+       loff_t ret;
+       int offset;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(vinode->i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+
+                       if (!nonblock) {
+                               folio_lock(folio);
+                       } else if (!folio_trylock(folio)) {
+                               folio_batch_release(&fbatch);
+                               return -EAGAIN;
+                       }
+
+                       offset = folio_data_offset(folio,
+                                       max(folio_pos(folio), start_offset),
+                                       min_replicas);
+                       if (offset >= 0) {
+                               ret = clamp(folio_pos(folio) + offset,
+                                           start_offset, end_offset);
+                               folio_unlock(folio);
+                               folio_batch_release(&fbatch);
+                               return ret;
+                       }
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       return end_offset;
+}
+
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+                             unsigned min_replicas, bool nonblock)
+{
+       struct folio *folio;
+       struct bch_folio *s;
+       unsigned i, sectors;
+       bool ret = true;
+
+       folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+                                   FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+       if (folio == ERR_PTR(-EAGAIN))
+               return -EAGAIN;
+       if (IS_ERR_OR_NULL(folio))
+               return true;
+
+       s = bch2_folio(folio);
+       if (!s)
+               goto unlock;
+
+       sectors = folio_sectors(folio);
+       for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+               if (s->s[i].state < SECTOR_dirty ||
+                   s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+                       *offset = max(*offset,
+                                     folio_pos(folio) + (i << SECTOR_SHIFT));
+                       goto unlock;
+               }
+
+       *offset = folio_end_pos(folio);
+       ret = false;
+unlock:
+       folio_unlock(folio);
+       folio_put(folio);
+       return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+                               loff_t start_offset,
+                               loff_t end_offset,
+                               unsigned min_replicas,
+                               bool nonblock)
+{
+       struct address_space *mapping = vinode->i_mapping;
+       loff_t offset = start_offset;
+
+       while (offset < end_offset &&
+              !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
+               ;
+
+       return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+                        u64 *hole_start,
+                        u64 *hole_end,
+                        unsigned min_replicas,
+                        bool nonblock)
+{
+       loff_t ret;
+
+       ret = bch2_seek_pagecache_hole(inode,
+               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+       if (ret < 0)
+               return ret;
+
+       *hole_start = ret;
+
+       if (*hole_start == *hole_end)
+               return 0;
+
+       ret = bch2_seek_pagecache_data(inode,
+               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+       if (ret < 0)
+               return ret;
+
+       *hole_end = ret;
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
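
folio_sector_dirty(), folio_sector_undirty() and folio_sector_reserve() in fs-io-pagecache.c above form a small per-sector state machine over the states declared by BCH_FOLIO_SECTOR_STATE() (unallocated, reserved, dirty, dirty_reserved, allocated). The standalone sketch below models only those transitions to show how reserving and then dirtying a sector composes; the names are invented for illustration and the code is not part of the patch:

    #include <stdio.h>

    enum sector_state { UNALLOCATED, RESERVED, DIRTY, DIRTY_RESERVED, ALLOCATED };

    /* mirror of folio_sector_dirty(): dirtying keeps any reservation */
    static enum sector_state sector_dirty(enum sector_state s)
    {
            switch (s) {
            case UNALLOCATED:  return DIRTY;
            case RESERVED:     return DIRTY_RESERVED;
            default:           return s;
            }
    }

    /* mirror of folio_sector_reserve(): reserving keeps any dirty state */
    static enum sector_state sector_reserve(enum sector_state s)
    {
            switch (s) {
            case UNALLOCATED:  return RESERVED;
            case DIRTY:        return DIRTY_RESERVED;
            default:           return s;
            }
    }

    int main(void)
    {
            printf("%d\n", sector_dirty(sector_reserve(UNALLOCATED))); /* 3: dirty_reserved */
            printf("%d\n", sector_dirty(ALLOCATED));                   /* 4: unchanged */
            return 0;
    }
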
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
new file mode 100644 (file)
index 0000000..f1c747c
--- /dev/null
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+                                    u64, int, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{
+       return folio_pos(folio) + folio_size(folio);
+}
+
+static inline size_t folio_sectors(struct folio *folio)
+{
+       return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{
+       return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{
+       return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE()       \
+       x(unallocated)                  \
+       x(reserved)                     \
+       x(dirty)                        \
+       x(dirty_reserved)               \
+       x(allocated)
+
+enum bch_folio_sector_state {
+#define x(n)   SECTOR_##n,
+       BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
+
+struct bch_folio_sector {
+       /* Uncompressed, fully allocated replicas (or on disk reservation): */
+       unsigned                nr_replicas:4;
+
+       /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
+       unsigned                replicas_reserved:4;
+
+       /* i_sectors: */
+       enum bch_folio_sector_state state:8;
+};
+
+struct bch_folio {
+       spinlock_t              lock;
+       atomic_t                write_count;
+       /*
+        * Is the sector state up to date with the btree?
+        * (Not the data itself)
+        */
+       bool                    uptodate;
+       struct bch_folio_sector s[];
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+                            struct bch_folio *s,
+                            unsigned i, unsigned n)
+{
+       s->s[i].state = n;
+}
+
+/* file offset (made relative to the folio) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+       u64 f_offset = pos - folio_pos(folio);
+       BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
+       return f_offset >> SECTOR_SHIFT;
+}
+
+/* for newly allocated folios: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+       kfree(folio_detach_private(folio));
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+       EBUG_ON(!folio_test_locked(folio));
+       __bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{
+       return folio_has_private(folio)
+               ? (struct bch_folio *) folio_get_private(folio)
+               : NULL;
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{
+       EBUG_ON(!folio_test_locked(folio));
+
+       return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {
+       struct disk_reservation disk;
+       struct quota_res        quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+       /* XXX: this should not be open coded */
+       return inode->ei_inode.bi_data_replicas
+               ? inode->ei_inode.bi_data_replicas - 1
+               : c->opts.data_replicas;
+}
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct bch2_folio_reservation *res)
+{
+       memset(res, 0, sizeof(*res));
+
+       res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+                               struct bch_inode_info *,
+                               struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+                       struct bch_inode_info *,
+                       struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+                       struct bch_inode_info *,
+                       struct folio *,
+                       struct bch2_folio_reservation *,
+                       unsigned, unsigned);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+                         struct bch_inode_info *,
+                         struct folio *,
+                         struct bch2_folio_reservation *,
+                         unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
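
BCH_FOLIO_SECTOR_STATE() is an x-macro: the same list that expands into the enum above can be expanded again into a matching name table, which is what the pre-split fs-io.c does for its bch2_folio_sector_states[] array (see the removed hunk further down). The second expansion, reproduced for reference:

    static const char * const bch2_folio_sector_states[] = {
    #define x(n)    #n,
            BCH_FOLIO_SECTOR_STATE()
    #undef x
            NULL
    };

The order of the list is significant: callers compare directly against SECTOR_dirty (state < SECTOR_dirty in folio_hole_offset(), state >= SECTOR_dirty in __bch2_writepage()), so new states cannot be inserted into the middle without auditing those comparisons.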
index d433f4d5662da9fd8991823ca2c37d1c0aae3082..e846d1ebf64e1485a10eeeb43f5b6fc771dd5654 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_buf.h"
+//#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
@@ -12,6 +12,9 @@
 #include "extent_update.h"
 #include "fs.h"
 #include "fs-io.h"
+#include "fs-io-buffered.h"
+//#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
 #include "journal.h"
 #include <linux/sched/signal.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/uio.h>
-#include <linux/writeback.h>
 
 #include <trace/events/writeback.h>
 
-static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-struct folio_vec {
-       struct folio    *fv_folio;
-       size_t          fv_offset;
-       size_t          fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-
-       struct folio *folio     = page_folio(bv.bv_page);
-       size_t offset           = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
-               bv.bv_offset;
-       size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
-       return (struct folio_vec) {
-               .fv_folio       = folio,
-               .fv_offset      = offset,
-               .fv_len         = len,
-       };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
-                                                   struct bvec_iter iter)
-{
-       return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start)                    \
-       for (iter = (start);                                            \
-            (iter).bi_size &&                                          \
-               ((bvl = bio_iter_iovec_folio((bio), (iter))), 1);       \
-            bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter)                             \
-       __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
-       return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
-       return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
-       return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
-       return folio_end_pos(folio) >> 9;
-}
-
-typedef DARRAY(struct folio *) folios;
-
-static int filemap_get_contig_folios_d(struct address_space *mapping,
-                                      loff_t start, u64 end,
-                                      int fgp_flags, gfp_t gfp,
-                                      folios *folios)
-{
-       struct folio *f;
-       u64 pos = start;
-       int ret = 0;
-
-       while (pos < end) {
-               if ((u64) pos >= (u64) start + (1ULL << 20))
-                       fgp_flags &= ~FGP_CREAT;
-
-               ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
-               if (ret)
-                       break;
-
-               f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
-               if (IS_ERR_OR_NULL(f))
-                       break;
-
-               BUG_ON(folios->nr && folio_pos(f) != pos);
-
-               pos = folio_end_pos(f);
-               darray_push(folios, f);
-       }
-
-       if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
-               ret = -ENOMEM;
-
-       return folios->nr ? 0 : ret;
-}
-
-struct nocow_flush {
-       struct closure  *cl;
-       struct bch_dev  *ca;
-       struct bio      bio;
-};
-
-static void nocow_flush_endio(struct bio *_bio)
-{
-
-       struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
-
-       closure_put(bio->cl);
-       percpu_ref_put(&bio->ca->io_ref);
-       bio_put(&bio->bio);
-}
-
-static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
-                                               struct bch_inode_info *inode,
-                                               struct closure *cl)
-{
-       struct nocow_flush *bio;
-       struct bch_dev *ca;
-       struct bch_devs_mask devs;
-       unsigned dev;
-
-       dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
-       if (dev == BCH_SB_MEMBERS_MAX)
-               return;
-
-       devs = inode->ei_devs_need_flush;
-       memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
-
-       for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
-               rcu_read_lock();
-               ca = rcu_dereference(c->devs[dev]);
-               if (ca && !percpu_ref_tryget(&ca->io_ref))
-                       ca = NULL;
-               rcu_read_unlock();
-
-               if (!ca)
-                       continue;
-
-               bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-                                                   REQ_OP_FLUSH,
-                                                   GFP_KERNEL,
-                                                   &c->nocow_flush_bioset),
-                                  struct nocow_flush, bio);
-               bio->cl                 = cl;
-               bio->ca                 = ca;
-               bio->bio.bi_end_io      = nocow_flush_endio;
-               closure_bio_submit(&bio->bio, cl);
-       }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
-                                        struct bch_inode_info *inode)
-{
-       struct closure cl;
-
-       closure_init_stack(&cl);
-       bch2_inode_flush_nocow_writes_async(c, inode, &cl);
-       closure_sync(&cl);
-
-       return 0;
-}
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-       if (bio->bi_vcnt >= bio->bi_max_vecs)
-               return true;
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
-               return true;
-       return false;
-}
-
-static inline struct address_space *faults_disabled_mapping(void)
-{
-       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
-       current->faults_disabled_mapping =
-               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
-       return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-struct quota_res {
-       u64                             sectors;
-};
-
-struct bch_writepage_io {
-       struct bch_inode_info           *inode;
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
-struct dio_write {
-       struct kiocb                    *req;
-       struct address_space            *mapping;
-       struct bch_inode_info           *inode;
-       struct mm_struct                *mm;
-       unsigned                        loop:1,
-                                       extending:1,
-                                       sync:1,
-                                       flush:1,
-                                       free_iov:1;
-       struct quota_res                quota_res;
-       u64                             written;
-
-       struct iov_iter                 iter;
-       struct iovec                    inline_vecs[2];
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
-struct dio_read {
-       struct closure                  cl;
-       struct kiocb                    *req;
-       long                            ret;
-       bool                            should_dirty;
-       struct bch_read_bio             rbio;
-};
-
-/* pagecache_block must be held */
-static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
-                                             loff_t start, loff_t end)
-{
-       int ret;
-
-       /*
-        * XXX: the way this is currently implemented, we can spin if a process
-        * is continually redirtying a specific page
-        */
-       do {
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = filemap_write_and_wait_range(mapping, start, end);
-               if (ret)
-                       break;
-
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = invalidate_inode_pages2_range(mapping,
-                               start >> PAGE_SHIFT,
-                               end >> PAGE_SHIFT);
-       } while (ret == -EBUSY);
-
-       return ret;
-}
-
-/* quotas */
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res)
-{
-       BUG_ON(res->sectors > inode->ei_quota_reserved);
-
-       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-                       -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
-       inode->ei_quota_reserved -= res->sectors;
-       res->sectors = 0;
-}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res)
-{
-       if (res->sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __bch2_quota_reservation_put(c, inode, res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     u64 sectors,
-                                     bool check_enospc)
-{
-       int ret;
-
-       if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
-               return 0;
-
-       mutex_lock(&inode->ei_quota_lock);
-       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
-                             check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
-       if (likely(!ret)) {
-               inode->ei_quota_reserved += sectors;
-               res->sectors += sectors;
-       }
-       mutex_unlock(&inode->ei_quota_lock);
-
-       return ret;
-}
-
-#else
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res) {}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res) {}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     unsigned sectors,
-                                     bool check_enospc)
-{
-       return 0;
-}
-
-#endif
-
-/* i_size updates: */
-
-struct inode_new_size {
-       loff_t          new_size;
-       u64             now;
-       unsigned        fields;
-};
-
-static int inode_set_size(struct bch_inode_info *inode,
-                         struct bch_inode_unpacked *bi,
-                         void *p)
-{
-       struct inode_new_size *s = p;
-
-       bi->bi_size = s->new_size;
-       if (s->fields & ATTR_ATIME)
-               bi->bi_atime = s->now;
-       if (s->fields & ATTR_MTIME)
-               bi->bi_mtime = s->now;
-       if (s->fields & ATTR_CTIME)
-               bi->bi_ctime = s->now;
-
-       return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      loff_t new_size, unsigned fields)
-{
-       struct inode_new_size s = {
-               .new_size       = new_size,
-               .now            = bch2_current_time(c),
-               .fields         = fields,
-       };
-
-       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, s64 sectors)
-{
-       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
-                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
-                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
-                               inode->ei_inode.bi_sectors);
-       inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-       if (quota_res &&
-           !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
-           sectors > 0) {
-               BUG_ON(sectors > quota_res->sectors);
-               BUG_ON(sectors > inode->ei_quota_reserved);
-
-               quota_res->sectors -= sectors;
-               inode->ei_quota_reserved -= sectors;
-       } else {
-               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
-       }
-#endif
-}
-
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, s64 sectors)
-{
-       if (sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __i_sectors_acct(c, inode, quota_res, sectors);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
-/* page state: */
-
-/* stored in page->private: */
-
-#define BCH_FOLIO_SECTOR_STATE()       \
-       x(unallocated)                  \
-       x(reserved)                     \
-       x(dirty)                        \
-       x(dirty_reserved)               \
-       x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n)   SECTOR_##n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-static const char * const bch2_folio_sector_states[] = {
-#define x(n)   #n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-       NULL
-};
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_dirty;
-       case SECTOR_reserved:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_dirty:
-               return SECTOR_unallocated;
-       case SECTOR_dirty_reserved:
-               return SECTOR_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_reserved;
-       case SECTOR_dirty:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
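
folio_sector_dirty(), folio_sector_undirty() and folio_sector_reserve() together form a small state machine over bch_folio_sector_state; read off the three switch statements above (any state not named in a switch is left unchanged), the transitions are:

    current          dirty()          reserve()        undirty()
    unallocated      dirty            reserved         unallocated
    reserved         dirty_reserved   reserved         reserved
    dirty            dirty            dirty_reserved   unallocated
    dirty_reserved   dirty_reserved   dirty_reserved   reserved
    allocated        allocated        allocated        allocated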
-struct bch_folio_sector {
-       /* Uncompressed, fully allocated replicas (or on disk reservation): */
-       unsigned                nr_replicas:4;
-
-       /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
-       unsigned                replicas_reserved:4;
-
-       /* i_sectors: */
-       enum bch_folio_sector_state state:8;
-};
-
-struct bch_folio {
-       spinlock_t              lock;
-       atomic_t                write_count;
-       /*
-        * Is the sector state up to date with the btree?
-        * (Not the data itself)
-        */
-       bool                    uptodate;
-       struct bch_folio_sector s[];
-};
-
-static inline void folio_sector_set(struct folio *folio,
-                            struct bch_folio *s,
-                            unsigned i, unsigned n)
-{
-       s->s[i].state = n;
-}
-
-/* file offset (to folio offset) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
-       u64 f_offset = pos - folio_pos(folio);
-       BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
-       return f_offset >> SECTOR_SHIFT;
-}
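
folio_pos_to_s() is plain shift arithmetic over 512-byte sectors (SECTOR_SHIFT == 9): for a hypothetical 16 KiB folio at byte position 65536, a position of 66048 gives f_offset = 512 and therefore sector index 1, and the BUG_ON rejects any position outside [folio_pos, folio_end_pos), here [65536, 81920).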
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
-       return folio_has_private(folio)
-               ? (struct bch_folio *) folio_get_private(folio)
-               : NULL;
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-
-       return __bch2_folio(folio);
-}
-
-/* for newly allocated folios: */
-static void __bch2_folio_release(struct folio *folio)
-{
-       kfree(folio_detach_private(folio));
-}
-
-static void bch2_folio_release(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-       __bch2_folio_release(folio);
-}
-
-/* for newly allocated folios: */
-static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       struct bch_folio *s;
-
-       s = kzalloc(sizeof(*s) +
-                   sizeof(struct bch_folio_sector) *
-                   folio_sectors(folio), gfp);
-       if (!s)
-               return NULL;
-
-       spin_lock_init(&s->lock);
-       folio_attach_private(folio, s);
-       return s;
-}
-
-static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
-       if (bkey_extent_is_reservation(k))
-               return SECTOR_reserved;
-       if (bkey_extent_is_allocation(k.k))
-               return SECTOR_allocated;
-       return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
-                            unsigned pg_offset, unsigned pg_len,
-                            unsigned nr_ptrs, unsigned state)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       BUG_ON(pg_offset >= sectors);
-       BUG_ON(pg_offset + pg_len > sectors);
-
-       spin_lock(&s->lock);
-
-       for (i = pg_offset; i < pg_offset + pg_len; i++) {
-               s->s[i].nr_replicas     = nr_ptrs;
-               folio_sector_set(folio, s, i, state);
-       }
-
-       if (i == sectors)
-               s->uptodate = true;
-
-       spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-static int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-                         struct folio **folios, unsigned nr_folios)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bch_folio *s;
-       u64 offset = folio_sector(folios[0]);
-       unsigned folio_idx;
-       u32 snapshot;
-       bool need_set = false;
-       int ret;
-
-       for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-               s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
-               if (!s)
-                       return -ENOMEM;
-
-               need_set |= !s->uptodate;
-       }
-
-       if (!need_set)
-               return 0;
-
-       folio_idx = 0;
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
-                          SPOS(inum.inum, offset, snapshot),
-                          BTREE_ITER_SLOTS, k, ret) {
-               unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
-               unsigned state = bkey_to_sector_state(k);
-
-               while (folio_idx < nr_folios) {
-                       struct folio *folio = folios[folio_idx];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end   = folio_end_sector(folio);
-                       unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
-                       unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
-
-                       BUG_ON(k.k->p.offset < folio_start);
-                       BUG_ON(bkey_start_offset(k.k) > folio_end);
-
-                       if (!bch2_folio(folio)->uptodate)
-                               __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
-                       if (k.k->p.offset < folio_end)
-                               break;
-                       folio_idx++;
-               }
-
-               if (folio_idx == nr_folios)
-                       break;
-       }
-
-       offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
-
-static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
-       struct bvec_iter iter;
-       struct folio_vec fv;
-       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
-       unsigned state = bkey_to_sector_state(k);
-
-       bio_for_each_folio(fv, bio, iter)
-               __bch2_folio_set(fv.fv_folio,
-                                fv.fv_offset >> 9,
-                                fv.fv_len >> 9,
-                                nr_ptrs, state);
-}
-
-static void mark_pagecache_unallocated(struct bch_inode_info *inode,
-                                      u64 start, u64 end)
-{
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       unsigned i, j;
-
-       if (end <= start)
-               return;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
-
-                       BUG_ON(end <= folio_start);
-
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++)
-                                       s->s[j].nr_replicas = 0;
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-}
-
-static void mark_pagecache_reserved(struct bch_inode_info *inode,
-                                   u64 start, u64 end)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       s64 i_sectors_delta = 0;
-       unsigned i, j;
-
-       if (end <= start)
-               return;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
-
-                       BUG_ON(end <= folio_start);
-
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++) {
-                                       i_sectors_delta -= s->s[j].state == SECTOR_dirty;
-                                       folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
-                               }
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
-}
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       /* XXX: this should not be open coded */
-       return inode->ei_inode.bi_data_replicas
-               ? inode->ei_inode.bi_data_replicas - 1
-               : c->opts.data_replicas;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
-                                         unsigned nr_replicas)
-{
-       return max(0, (int) nr_replicas -
-                  s->nr_replicas -
-                  s->replicas_reserved);
-}
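
sectors_to_reserve() is the per-sector shortfall between the replication the inode wants and what the sector already has, counting both fully allocated replicas and replicas already reserved. For example, with nr_replicas = 2 wanted, s->nr_replicas = 1 and s->replicas_reserved = 0, one more replica's worth of space must be reserved; once replicas_reserved reaches 1 the shortfall drops to 0, and the (int) cast plus max(0, ...) keeps an over-replicated sector (say 1 wanted against 2 already allocated) from going negative.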
-
-static int bch2_get_folio_disk_reservation(struct bch_fs *c,
-                               struct bch_inode_info *inode,
-                               struct folio *folio, bool check_enospc)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned nr_replicas = inode_nr_replicas(c, inode);
-       struct disk_reservation disk_res = { 0 };
-       unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       for (i = 0; i < sectors; i++)
-               disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
-       if (!disk_res_sectors)
-               return 0;
-
-       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       disk_res_sectors, 1,
-                                       !check_enospc
-                                       ? BCH_DISK_RESERVATION_NOFAIL
-                                       : 0);
-       if (unlikely(ret))
-               return ret;
-
-       for (i = 0; i < sectors; i++)
-               s->s[i].replicas_reserved +=
-                       sectors_to_reserve(&s->s[i], nr_replicas);
-
-       return 0;
-}
-
-struct bch2_folio_reservation {
-       struct disk_reservation disk;
-       struct quota_res        quota;
-};
-
-static void bch2_folio_reservation_init(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       memset(res, 0, sizeof(*res));
-
-       res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-static void bch2_folio_reservation_put(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       bch2_disk_reservation_put(c, &res->disk);
-       bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int bch2_folio_reservation_get(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       unsigned offset, unsigned len)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned i, disk_sectors = 0, quota_sectors = 0;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       BUG_ON(!s->uptodate);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               disk_sectors += sectors_to_reserve(&s->s[i],
-                                               res->disk.nr_replicas);
-               quota_sectors += s->s[i].state == SECTOR_unallocated;
-       }
-
-       if (disk_sectors) {
-               ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
-               if (unlikely(ret))
-                       return ret;
-       }
-
-       if (quota_sectors) {
-               ret = bch2_quota_reservation_add(c, inode, &res->quota,
-                                                quota_sectors, true);
-               if (unlikely(ret)) {
-                       struct disk_reservation tmp = {
-                               .sectors = disk_sectors
-                       };
-
-                       bch2_disk_reservation_put(c, &tmp);
-                       res->disk.sectors -= disk_sectors;
-                       return ret;
-               }
-       }
-
-       return 0;
-}
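
Callers pair these as init -> get -> dirty -> put; bch2_page_mkwrite() further down is the canonical user. A condensed sketch of that lifecycle with the real caller's locking and error handling elided; the folio is assumed to be locked, and apart from the wrapper's own name everything is taken from this file:

    static int example_reserve_and_dirty(struct bch_fs *c,
                                         struct bch_inode_info *inode,
                                         struct folio *folio,
                                         unsigned offset, unsigned len)
    {
            struct bch2_folio_reservation res;
            int ret;

            bch2_folio_reservation_init(c, inode, &res);

            /* sector state must be up to date with the btree before reserving: */
            ret = bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
                  bch2_folio_reservation_get(c, inode, folio, &res, offset, len);
            if (ret)
                    return ret;

            bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
            bch2_folio_reservation_put(c, inode, &res);
            return 0;
    }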
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_folio *s = bch2_folio(folio);
-       struct disk_reservation disk_res = { 0 };
-       int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
-       if (!s)
-               return;
-
-       EBUG_ON(!folio_test_locked(folio));
-       EBUG_ON(folio_test_writeback(folio));
-
-       for (i = 0; i < sectors; i++) {
-               disk_res.sectors += s->s[i].replicas_reserved;
-               s->s[i].replicas_reserved = 0;
-
-               dirty_sectors -= s->s[i].state == SECTOR_dirty;
-               folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
-       }
-
-       bch2_disk_reservation_put(c, &disk_res);
-
-       i_sectors_acct(c, inode, NULL, dirty_sectors);
-
-       bch2_folio_release(folio);
-}
-
-static void bch2_set_folio_dirty(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       unsigned offset, unsigned len)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, dirty_sectors = 0;
-
-       WARN_ON((u64) folio_pos(folio) + offset + len >
-               round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
-       BUG_ON(!s->uptodate);
-
-       spin_lock(&s->lock);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               unsigned sectors = sectors_to_reserve(&s->s[i],
-                                               res->disk.nr_replicas);
-
-               /*
-                * This can happen if we race with the error path in
-                * bch2_writepage_io_done():
-                */
-               sectors = min_t(unsigned, sectors, res->disk.sectors);
-
-               s->s[i].replicas_reserved += sectors;
-               res->disk.sectors -= sectors;
-
-               dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
-               folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
-       }
-
-       spin_unlock(&s->lock);
-
-       i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
-       if (!folio_test_dirty(folio))
-               filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
-       struct file *file = vmf->vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct address_space *fdm = faults_disabled_mapping();
-       struct bch_inode_info *inode = file_bch_inode(file);
-       vm_fault_t ret;
-
-       if (fdm == mapping)
-               return VM_FAULT_SIGBUS;
-
-       /* Lock ordering: */
-       if (fdm > mapping) {
-               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
-               if (bch2_pagecache_add_tryget(inode))
-                       goto got_lock;
-
-               bch2_pagecache_block_put(fdm_host);
-
-               bch2_pagecache_add_get(inode);
-               bch2_pagecache_add_put(inode);
-
-               bch2_pagecache_block_get(fdm_host);
-
-               /* Signal that lock has been dropped: */
-               set_fdm_dropped_locks();
-               return VM_FAULT_SIGBUS;
-       }
-
-       bch2_pagecache_add_get(inode);
-got_lock:
-       ret = filemap_fault(vmf);
-       bch2_pagecache_add_put(inode);
-
-       return ret;
-}
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
-       struct folio *folio = page_folio(vmf->page);
-       struct file *file = vmf->vma->vm_file;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       unsigned len;
-       loff_t isize;
-       vm_fault_t ret;
-
-       bch2_folio_reservation_init(c, inode, &res);
-
-       sb_start_pagefault(inode->v.i_sb);
-       file_update_time(file);
-
-       /*
-        * Not strictly necessary, but helps avoid dio writes livelocking in
-        * write_invalidate_inode_pages_range() - can drop this if/when we get
-        * a write_invalidate_inode_pages_range() that works without dropping
-        * page lock before invalidating page
-        */
-       bch2_pagecache_add_get(inode);
-
-       folio_lock(folio);
-       isize = i_size_read(&inode->v);
-
-       if (folio->mapping != mapping || folio_pos(folio) >= isize) {
-               folio_unlock(folio);
-               ret = VM_FAULT_NOPAGE;
-               goto out;
-       }
-
-       len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
-
-       if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
-           bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
-               folio_unlock(folio);
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
-
-       bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       folio_wait_stable(folio);
-       ret = VM_FAULT_LOCKED;
-out:
-       bch2_pagecache_add_put(inode);
-       sb_end_pagefault(inode->v.i_sb);
-
-       return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
-       if (offset || length < folio_size(folio))
-               return;
-
-       bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
-       if (folio_test_dirty(folio) || folio_test_writeback(folio))
-               return false;
-
-       bch2_clear_folio_bits(folio);
-       return true;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
-       struct folio_iter fi;
-
-       bio_for_each_folio_all(fi, bio) {
-               if (!bio->bi_status) {
-                       folio_mark_uptodate(fi.folio);
-               } else {
-                       folio_clear_uptodate(fi.folio);
-                       folio_set_error(fi.folio);
-               }
-               folio_unlock(fi.folio);
-       }
-
-       bio_put(bio);
-}
-
-struct readpages_iter {
-       struct address_space    *mapping;
-       unsigned                idx;
-       folios                  folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
-                              struct readahead_control *ractl)
-{
-       struct folio **fi;
-       int ret;
-
-       memset(iter, 0, sizeof(*iter));
-
-       iter->mapping = ractl->mapping;
-
-       ret = filemap_get_contig_folios_d(iter->mapping,
-                               ractl->_index << PAGE_SHIFT,
-                               (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
-                               0, mapping_gfp_mask(iter->mapping),
-                               &iter->folios);
-       if (ret)
-               return ret;
-
-       darray_for_each(iter->folios, fi) {
-               ractl->_nr_pages -= 1U << folio_order(*fi);
-               __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
-               folio_put(*fi);
-               folio_put(*fi);
-       }
-
-       return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
-       if (iter->idx >= iter->folios.nr)
-               return NULL;
-       return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
-       iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
-
-       bkey_for_each_crc(k.k, ptrs, crc, i)
-               if (crc.csum_type || crc.compression_type)
-                       return true;
-       return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
-                              struct readpages_iter *iter,
-                              struct bio *bio,
-                              unsigned sectors_this_extent,
-                              bool get_more)
-{
-       /* Don't hold btree locks while allocating memory: */
-       bch2_trans_unlock(trans);
-
-       while (bio_sectors(bio) < sectors_this_extent &&
-              bio->bi_vcnt < bio->bi_max_vecs) {
-               struct folio *folio = readpage_iter_peek(iter);
-               int ret;
-
-               if (folio) {
-                       readpage_iter_advance(iter);
-               } else {
-                       pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
-                       if (!get_more)
-                               break;
-
-                       folio = xa_load(&iter->mapping->i_pages, folio_offset);
-                       if (folio && !xa_is_value(folio))
-                               break;
-
-                       folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
-                       if (!folio)
-                               break;
-
-                       if (!__bch2_folio_create(folio, GFP_KERNEL)) {
-                               folio_put(folio);
-                               break;
-                       }
-
-                       ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
-                       if (ret) {
-                               __bch2_folio_release(folio);
-                               folio_put(folio);
-                               break;
-                       }
-
-                       folio_put(folio);
-               }
-
-               BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
-               BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
-       }
-
-       return bch2_trans_relock(trans);
-}
-
-static void bchfs_read(struct btree_trans *trans,
-                      struct bch_read_bio *rbio,
-                      subvol_inum inum,
-                      struct readpages_iter *readpages_iter)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       int flags = BCH_READ_RETRY_IF_STALE|
-               BCH_READ_MAY_PROMOTE;
-       u32 snapshot;
-       int ret = 0;
-
-       rbio->c = c;
-       rbio->start_time = local_clock();
-       rbio->subvol = inum.subvol;
-
-       bch2_bkey_buf_init(&sk);
-retry:
-       bch2_trans_begin(trans);
-       iter = (struct btree_iter) { NULL };
-
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
-                            BTREE_ITER_SLOTS);
-       while (1) {
-               struct bkey_s_c k;
-               unsigned bytes, sectors, offset_into_extent;
-               enum btree_id data_btree = BTREE_ID_extents;
-
-               /*
-                * read_extent -> io_time_reset may cause a transaction restart
-                * without returning an error, we need to check for that here:
-                */
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       break;
-
-               bch2_btree_iter_set_pos(&iter,
-                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       break;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               sectors = min(sectors, k.k->size - offset_into_extent);
-
-               if (readpages_iter) {
-                       ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
-                                                 extent_partial_reads_expensive(k));
-                       if (ret)
-                               break;
-               }
-
-               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-
-               if (rbio->bio.bi_iter.bi_size == bytes)
-                       flags |= BCH_READ_LAST_FRAGMENT;
-
-               bch2_bio_page_state_set(&rbio->bio, k);
-
-               bch2_read_extent(trans, rbio, iter.pos,
-                                data_btree, k, offset_into_extent, flags);
-
-               if (flags & BCH_READ_LAST_FRAGMENT)
-                       break;
-
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-               bio_advance(&rbio->bio, bytes);
-
-               ret = btree_trans_too_many_iters(trans);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (ret) {
-               bch_err_inum_offset_ratelimited(c,
-                               iter.pos.inode,
-                               iter.pos.offset << 9,
-                               "read error %i from btree lookup", ret);
-               rbio->bio.bi_status = BLK_STS_IOERR;
-               bio_endio(&rbio->bio);
-       }
-
-       bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
-       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct btree_trans trans;
-       struct folio *folio;
-       struct readpages_iter readpages_iter;
-       int ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       ret = readpages_iter_init(&readpages_iter, ractl);
-       BUG_ON(ret);
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_pagecache_add_get(inode);
-
-       while ((folio = readpage_iter_peek(&readpages_iter))) {
-               unsigned n = min_t(unsigned,
-                                  readpages_iter.folios.nr -
-                                  readpages_iter.idx,
-                                  BIO_MAX_VECS);
-               struct bch_read_bio *rbio =
-                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
-                                                  GFP_KERNEL, &c->bio_read),
-                                 opts);
-
-               readpage_iter_advance(&readpages_iter);
-
-               rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-               rbio->bio.bi_end_io = bch2_readpages_end_io;
-               BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-               bchfs_read(&trans, rbio, inode_inum(inode),
-                          &readpages_iter);
-               bch2_trans_unlock(&trans);
-       }
-
-       bch2_pagecache_add_put(inode);
-
-       bch2_trans_exit(&trans);
-       darray_exit(&readpages_iter.folios);
-}
-
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
-                            subvol_inum inum, struct folio *folio)
-{
-       struct btree_trans trans;
-
-       bch2_folio_create(folio, __GFP_NOFAIL);
-
-       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-       rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-       BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-       bch2_trans_init(&trans, c, 0, 0);
-       bchfs_read(&trans, rbio, inum, NULL);
-       bch2_trans_exit(&trans);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
-       complete(bio->bi_private);
-}
-
-static int bch2_read_single_folio(struct folio *folio,
-                                 struct address_space *mapping)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_read_bio *rbio;
-       struct bch_io_opts opts;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-                        opts);
-       rbio->bio.bi_private = &done;
-       rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
-       __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
-       wait_for_completion(&done);
-
-       ret = blk_status_to_errno(rbio->bio.bi_status);
-       bio_put(&rbio->bio);
-
-       if (ret < 0)
-               return ret;
-
-       folio_mark_uptodate(folio);
-       return 0;
-}
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
-       int ret;
-
-       ret = bch2_read_single_folio(folio, folio->mapping);
-       folio_unlock(folio);
-       return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_state {
-       struct bch_writepage_io *io;
-       struct bch_io_opts      opts;
-       struct bch_folio_sector *tmp;
-       unsigned                tmp_sectors;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
-                                                                 struct bch_inode_info *inode)
-{
-       struct bch_writepage_state ret = { 0 };
-
-       bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
-       return ret;
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
-       struct bch_writepage_io *io =
-               container_of(op, struct bch_writepage_io, op);
-       struct bch_fs *c = io->op.c;
-       struct bio *bio = &io->op.wbio.bio;
-       struct folio_iter fi;
-       unsigned i;
-
-       if (io->op.error) {
-               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       folio_set_error(fi.folio);
-                       mapping_set_error(fi.folio->mapping, -EIO);
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       /*
-        * racing with fallocate can cause us to add fewer sectors than
-        * expected - but we shouldn't add more sectors than expected:
-        */
-       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
-       /*
-        * (error (due to going RO) halfway through a page can screw that up
-        * slightly)
-        * XXX wtf?
-          BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
-        */
-
-       /*
-        * PageWriteback is effectively our ref on the inode - fixup i_blocks
-        * before calling end_page_writeback:
-        */
-       i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
-       bio_for_each_folio_all(fi, bio) {
-               struct bch_folio *s = __bch2_folio(fi.folio);
-
-               if (atomic_dec_and_test(&s->write_count))
-                       folio_end_writeback(fi.folio);
-       }
-
-       bio_put(&io->op.wbio.bio);
-}
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
-       struct bch_writepage_io *io = w->io;
-
-       w->io = NULL;
-       closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
-                                   struct writeback_control *wbc,
-                                   struct bch_writepage_state *w,
-                                   struct bch_inode_info *inode,
-                                   u64 sector,
-                                   unsigned nr_replicas)
-{
-       struct bch_write_op *op;
-
-       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
-                                             REQ_OP_WRITE,
-                                             GFP_KERNEL,
-                                             &c->writepage_bioset),
-                            struct bch_writepage_io, op.wbio.bio);
-
-       w->io->inode            = inode;
-       op                      = &w->io->op;
-       bch2_write_op_init(op, c, w->opts);
-       op->target              = w->opts.foreground_target;
-       op->nr_replicas         = nr_replicas;
-       op->res.nr_replicas     = nr_replicas;
-       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
-       op->subvol              = inode->ei_subvol;
-       op->pos                 = POS(inode->v.i_ino, sector);
-       op->end_io              = bch2_writepage_io_done;
-       op->devs_need_flush     = &inode->ei_devs_need_flush;
-       op->wbio.bio.bi_iter.bi_sector = sector;
-       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
-                           struct writeback_control *wbc,
-                           void *data)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_writepage_state *w = data;
-       struct bch_folio *s;
-       unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
-       loff_t i_size = i_size_read(&inode->v);
-       int ret;
-
-       EBUG_ON(!folio_test_uptodate(folio));
-
-       /* Is the folio fully inside i_size? */
-       if (folio_end_pos(folio) <= i_size)
-               goto do_io;
-
-       /* Is the folio fully outside i_size? (truncate in progress) */
-       if (folio_pos(folio) >= i_size) {
-               folio_unlock(folio);
-               return 0;
-       }
-
-       /*
-        * The folio straddles i_size.  It must be zeroed out on each and every
-        * writepage invocation because it may be mmapped.  "A file is mapped
-        * in multiples of the folio size.  For a file that is not a multiple of
-        * the  folio size, the remaining memory is zeroed when mapped, and
-        * writes to that region are not written out to the file."
-        */
-       folio_zero_segment(folio,
-                          i_size - folio_pos(folio),
-                          folio_size(folio));
-do_io:
-       f_sectors = folio_sectors(folio);
-       s = bch2_folio(folio);
-
-       if (f_sectors > w->tmp_sectors) {
-               kfree(w->tmp);
-               w->tmp = kzalloc(sizeof(struct bch_folio_sector) *
-                                f_sectors, __GFP_NOFAIL);
-               w->tmp_sectors = f_sectors;
-       }
-
-       /*
-        * Things get really hairy with errors during writeback:
-        */
-       ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
-       BUG_ON(ret);
-
-       /* Before unlocking the page, get copy of reservations: */
-       spin_lock(&s->lock);
-       memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               nr_replicas_this_write =
-                       min_t(unsigned, nr_replicas_this_write,
-                             s->s[i].nr_replicas +
-                             s->s[i].replicas_reserved);
-       }
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               s->s[i].nr_replicas = w->opts.compression
-                       ? 0 : nr_replicas_this_write;
-
-               s->s[i].replicas_reserved = 0;
-               folio_sector_set(folio, s, i, SECTOR_allocated);
-       }
-       spin_unlock(&s->lock);
-
-       BUG_ON(atomic_read(&s->write_count));
-       atomic_set(&s->write_count, 1);
-
-       BUG_ON(folio_test_writeback(folio));
-       folio_start_writeback(folio);
-
-       folio_unlock(folio);
-
-       offset = 0;
-       while (1) {
-               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
-               u64 sector;
-
-               while (offset < f_sectors &&
-                      w->tmp[offset].state < SECTOR_dirty)
-                       offset++;
-
-               if (offset == f_sectors)
-                       break;
-
-               while (offset + sectors < f_sectors &&
-                      w->tmp[offset + sectors].state >= SECTOR_dirty) {
-                       reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
-                       dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
-                       sectors++;
-               }
-               BUG_ON(!sectors);
-
-               sector = folio_sector(folio) + offset;
-
-               if (w->io &&
-                   (w->io->op.res.nr_replicas != nr_replicas_this_write ||
-                    bio_full(&w->io->op.wbio.bio, sectors << 9) ||
-                    w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
-                    (BIO_MAX_VECS * PAGE_SIZE) ||
-                    bio_end_sector(&w->io->op.wbio.bio) != sector))
-                       bch2_writepage_do_io(w);
-
-               if (!w->io)
-                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
-                                               nr_replicas_this_write);
-
-               atomic_inc(&s->write_count);
-
-               BUG_ON(inode != w->io->inode);
-               BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
-                                    sectors << 9, offset << 9));
-
-               /* Check for writing past i_size: */
-               WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
-                         round_up(i_size, block_bytes(c)) &&
-                         !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
-                         "writing past i_size: %llu > %llu (unrounded %llu)\n",
-                         bio_end_sector(&w->io->op.wbio.bio) << 9,
-                         round_up(i_size, block_bytes(c)),
-                         i_size);
-
-               w->io->op.res.sectors += reserved_sectors;
-               w->io->op.i_sectors_delta -= dirty_sectors;
-               w->io->op.new_i_size = i_size;
-
-               offset += sectors;
-       }
-
-       if (atomic_dec_and_test(&s->write_count))
-               folio_end_writeback(folio);
-
-       return 0;
-}
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state w =
-               bch_writepage_state_init(c, to_bch_ei(mapping->host));
-       struct blk_plug plug;
-       int ret;
-
-       blk_start_plug(&plug);
-       ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
-       if (w.io)
-               bch2_writepage_do_io(&w);
-       blk_finish_plug(&plug);
-       kfree(w.tmp);
-       return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
-                    loff_t pos, unsigned len,
-                    struct page **pagep, void **fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res;
-       struct folio *folio;
-       unsigned offset;
-       int ret = -ENOMEM;
-
-       res = kmalloc(sizeof(*res), GFP_KERNEL);
-       if (!res)
-               return -ENOMEM;
-
-       bch2_folio_reservation_init(c, inode, res);
-       *fsdata = res;
-
-       bch2_pagecache_add_get(inode);
-
-       folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
-                               FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
-                               mapping_gfp_mask(mapping));
-       if (IS_ERR_OR_NULL(folio))
-               goto err_unlock;
-
-       if (folio_test_uptodate(folio))
-               goto out;
-
-       offset = pos - folio_pos(folio);
-       len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
-       /* If we're writing entire folio, don't need to read it in first: */
-       if (!offset && len == folio_size(folio))
-               goto out;
-
-       if (!offset && pos + len >= inode->v.i_size) {
-               folio_zero_segment(folio, len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-
-       if (folio_pos(folio) >= inode->v.i_size) {
-               folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-readpage:
-       ret = bch2_read_single_folio(folio, mapping);
-       if (ret)
-               goto err;
-out:
-       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-       if (ret)
-               goto err;
-
-       ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
-       if (ret) {
-               if (!folio_test_uptodate(folio)) {
-                       /*
-                        * If the folio hasn't been read in, we won't know if we
-                        * actually need a reservation - we don't actually need
-                        * to read here, we just need to check if the folio is
-                        * fully backed by uncompressed data:
-                        */
-                       goto readpage;
-               }
-
-               goto err;
-       }
-
-       *pagep = &folio->page;
-       return 0;
-err:
-       folio_unlock(folio);
-       folio_put(folio);
-       *pagep = NULL;
-err_unlock:
-       bch2_pagecache_add_put(inode);
-       kfree(res);
-       *fsdata = NULL;
-       return bch2_err_class(ret);
-}
-
-int bch2_write_end(struct file *file, struct address_space *mapping,
-                  loff_t pos, unsigned len, unsigned copied,
-                  struct page *page, void *fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res = fsdata;
-       struct folio *folio = page_folio(page);
-       unsigned offset = pos - folio_pos(folio);
-
-       lockdep_assert_held(&inode->v.i_rwsem);
-       BUG_ON(offset + copied > folio_size(folio));
-
-       if (unlikely(copied < len && !folio_test_uptodate(folio))) {
-               /*
-                * The folio needs to be read in, but that would destroy
-                * our partial write - simplest thing is to just force
-                * userspace to redo the write:
-                */
-               folio_zero_range(folio, 0, folio_size(folio));
-               flush_dcache_folio(folio);
-               copied = 0;
-       }
-
-       spin_lock(&inode->v.i_lock);
-       if (pos + copied > inode->v.i_size)
-               i_size_write(&inode->v, pos + copied);
-       spin_unlock(&inode->v.i_lock);
-
-       if (copied) {
-               if (!folio_test_uptodate(folio))
-                       folio_mark_uptodate(folio);
-
-               bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
-               inode->ei_last_dirtied = (unsigned long) current;
-       }
-
-       folio_unlock(folio);
-       folio_put(folio);
-       bch2_pagecache_add_put(inode);
-
-       bch2_folio_reservation_put(c, inode, res);
-       kfree(res);
-
-       return copied;
-}
-
-static noinline void folios_trunc(folios *folios, struct folio **fi)
-{
-       while (folios->data + folios->nr > fi) {
-               struct folio *f = darray_pop(folios);
-
-               folio_unlock(f);
-               folio_put(f);
-       }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
-                                struct address_space *mapping,
-                                struct iov_iter *iter,
-                                loff_t pos, unsigned len)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       folios folios;
-       struct folio **fi, *f;
-       unsigned copied = 0, f_offset;
-       u64 end = pos + len, f_pos;
-       loff_t last_folio_pos = inode->v.i_size;
-       int ret = 0;
-
-       BUG_ON(!len);
-
-       bch2_folio_reservation_init(c, inode, &res);
-       darray_init(&folios);
-
-       ret = filemap_get_contig_folios_d(mapping, pos, end,
-                                  FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
-                                  mapping_gfp_mask(mapping),
-                                  &folios);
-       if (ret)
-               goto out;
-
-       BUG_ON(!folios.nr);
-
-       f = darray_first(folios);
-       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
-               ret = bch2_read_single_folio(f, mapping);
-               if (ret)
-                       goto out;
-       }
-
-       f = darray_last(folios);
-       end = min(end, folio_end_pos(f));
-       last_folio_pos = folio_pos(f);
-       if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
-               if (end >= inode->v.i_size) {
-                       folio_zero_range(f, 0, folio_size(f));
-               } else {
-                       ret = bch2_read_single_folio(f, mapping);
-                       if (ret)
-                               goto out;
-               }
-       }
-
-       ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
-       if (ret)
-               goto out;
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
-               /*
-                * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
-                * supposed to write as much as we have disk space for.
-                *
-                * On failure here we should still write out a partial page if
-                * we aren't completely out of disk space - we don't do that
-                * yet:
-                */
-               ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
-               if (unlikely(ret)) {
-                       folios_trunc(&folios, fi);
-                       if (!folios.nr)
-                               goto out;
-
-                       end = min(end, folio_end_pos(darray_last(folios)));
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (mapping_writably_mapped(mapping))
-               darray_for_each(folios, fi)
-                       flush_dcache_folio(*fi);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-               unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-
-               if (!f_copied) {
-                       folios_trunc(&folios, fi);
-                       break;
-               }
-
-               if (!folio_test_uptodate(f) &&
-                   f_copied != folio_size(f) &&
-                   pos + copied + f_copied < inode->v.i_size) {
-                       folio_zero_range(f, 0, folio_size(f));
-                       folios_trunc(&folios, fi);
-                       break;
-               }
-
-               flush_dcache_folio(f);
-               copied += f_copied;
-
-               if (f_copied != f_len) {
-                       folios_trunc(&folios, fi + 1);
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (!copied)
-               goto out;
-
-       end = pos + copied;
-
-       spin_lock(&inode->v.i_lock);
-       if (end > inode->v.i_size)
-               i_size_write(&inode->v, end);
-       spin_unlock(&inode->v.i_lock);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
-               if (!folio_test_uptodate(f))
-                       folio_mark_uptodate(f);
-
-               bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       inode->ei_last_dirtied = (unsigned long) current;
-out:
-       darray_for_each(folios, fi) {
-               folio_unlock(*fi);
-               folio_put(*fi);
-       }
-
-       /*
-        * If the last folio added to the mapping starts beyond current EOF, we
-        * performed a short write but left around at least one post-EOF folio.
-        * Clean up the mapping before we return.
-        */
-       if (last_folio_pos >= inode->v.i_size)
-               truncate_pagecache(&inode->v, inode->v.i_size);
-
-       darray_exit(&folios);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       return copied ?: ret;
-}
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       loff_t pos = iocb->ki_pos;
-       ssize_t written = 0;
-       int ret = 0;
-
-       bch2_pagecache_add_get(inode);
-
-       do {
-               unsigned offset = pos & (PAGE_SIZE - 1);
-               unsigned bytes = iov_iter_count(iter);
-again:
-               /*
-                * Bring in the user page that we will copy from _first_.
-                * Otherwise there's a nasty deadlock on copying from the
-                * same page as we're writing to, without it being marked
-                * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid, when atomic
-                * usercopies are used, below.
-                */
-               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                       bytes = min_t(unsigned long, iov_iter_count(iter),
-                                     PAGE_SIZE - offset);
-
-                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-               }
-
-               if (unlikely(fatal_signal_pending(current))) {
-                       ret = -EINTR;
-                       break;
-               }
-
-               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
-               if (unlikely(ret < 0))
-                       break;
-
-               cond_resched();
-
-               if (unlikely(ret == 0)) {
-                       /*
-                        * If we were unable to copy any data at all, we must
-                        * fall back to a single segment length write.
-                        *
-                        * If we didn't fallback here, we could livelock
-                        * because not all segments in the iov can be copied at
-                        * once without a pagefault.
-                        */
-                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
-                                     iov_iter_single_seg_count(iter));
-                       goto again;
-               }
-               pos += ret;
-               written += ret;
-               ret = 0;
-
-               balance_dirty_pages_ratelimited(mapping);
-       } while (iov_iter_count(iter));
-
-       bch2_pagecache_add_put(inode);
-
-       return written ? written : ret;
-}
-
-/* O_DIRECT reads */
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
-       if (check_dirty) {
-               bio_check_pages_dirty(bio);
-       } else {
-               bio_release_pages(bio, false);
-               bio_put(bio);
-       }
-}
-
-static void bch2_dio_read_complete(struct closure *cl)
-{
-       struct dio_read *dio = container_of(cl, struct dio_read, cl);
-
-       dio->req->ki_complete(dio->req, dio->ret);
-       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-
-       if (bio->bi_status)
-               dio->ret = blk_status_to_errno(bio->bi_status);
-
-       closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
-
-       bch2_direct_IO_read_endio(bio);
-       bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
-       struct file *file = req->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct dio_read *dio;
-       struct bio *bio;
-       loff_t offset = req->ki_pos;
-       bool sync = is_sync_kiocb(req);
-       size_t shorten;
-       ssize_t ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       if ((offset|iter->count) & (block_bytes(c) - 1))
-               return -EINVAL;
-
-       ret = min_t(loff_t, iter->count,
-                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
-       if (!ret)
-               return ret;
-
-       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
-       iter->count -= shorten;
-
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_READ,
-                              GFP_KERNEL,
-                              &c->dio_read_bioset);
-
-       bio->bi_end_io = bch2_direct_IO_read_endio;
-
-       dio = container_of(bio, struct dio_read, rbio.bio);
-       closure_init(&dio->cl, NULL);
-
-       /*
-        * this is a _really_ horrible hack just to avoid an atomic sub at the
-        * end:
-        */
-       if (!sync) {
-               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER -
-                          CLOSURE_RUNNING +
-                          CLOSURE_DESTRUCTOR);
-       } else {
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER + 1);
-       }
-
-       dio->req        = req;
-       dio->ret        = ret;
-       /*
-        * This is one of the sketchier things I've encountered: we have to skip
-        * the dirtying of requests that are internal from the kernel (i.e. from
-        * loopback), because we'll deadlock on page_lock.
-        */
-       dio->should_dirty = iter_is_iovec(iter);
-
-       goto start;
-       while (iter->count) {
-               bio = bio_alloc_bioset(NULL,
-                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                                      REQ_OP_READ,
-                                      GFP_KERNEL,
-                                      &c->bio_read);
-               bio->bi_end_io          = bch2_direct_IO_read_split_endio;
-start:
-               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
-               bio->bi_iter.bi_sector  = offset >> 9;
-               bio->bi_private         = dio;
-
-               ret = bio_iov_iter_get_pages(bio, iter);
-               if (ret < 0) {
-                       /* XXX: fault inject this path */
-                       bio->bi_status = BLK_STS_RESOURCE;
-                       bio_endio(bio);
-                       break;
-               }
-
-               offset += bio->bi_iter.bi_size;
-
-               if (dio->should_dirty)
-                       bio_set_pages_dirty(bio);
-
-               if (iter->count)
-                       closure_get(&dio->cl);
-
-               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
-       }
-
-       iter->count += shorten;
-
-       if (sync) {
-               closure_sync(&dio->cl);
-               closure_debug_destroy(&dio->cl);
-               ret = dio->ret;
-               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-               return ret;
-       } else {
-               return -EIOCBQUEUED;
-       }
-}
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
-
-       if (!count)
-               return 0; /* skip atime */
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               struct blk_plug plug;
-
-               if (unlikely(mapping->nrpages)) {
-                       ret = filemap_write_and_wait_range(mapping,
-                                               iocb->ki_pos,
-                                               iocb->ki_pos + count - 1);
-                       if (ret < 0)
-                               goto out;
-               }
-
-               file_accessed(file);
-
-               blk_start_plug(&plug);
-               ret = bch2_direct_IO_read(iocb, iter);
-               blk_finish_plug(&plug);
-
-               if (ret >= 0)
-                       iocb->ki_pos += ret;
-       } else {
-               bch2_pagecache_add_get(inode);
-               ret = generic_file_read_iter(iocb, iter);
-               bch2_pagecache_add_put(inode);
-       }
-out:
-       return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
-                                      u64 offset, u64 size,
-                                      unsigned nr_replicas, bool compressed)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 end = offset + size;
-       u32 snapshot;
-       bool ret = true;
-       int err;
-
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (err)
-               goto err;
-
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
-                          SPOS(inum.inum, offset, snapshot),
-                          BTREE_ITER_SLOTS, k, err) {
-               if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
-                       break;
-
-               if (k.k->p.snapshot != snapshot ||
-                   nr_replicas > bch2_bkey_replicas(c, k) ||
-                   (!compressed && bch2_bkey_sectors_compressed(k))) {
-                       ret = false;
-                       break;
-               }
-       }
-
-       offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
-err:
-       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
-               goto retry;
-       bch2_trans_exit(&trans);
-
-       return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       return bch2_check_range_allocated(c, inode_inum(inode),
-                               dio->op.pos.offset, bio_sectors(bio),
-                               dio->op.opts.data_replicas,
-                               dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
-       struct iovec *iov = dio->inline_vecs;
-
-       /*
-        * iov_iter has a single embedded iovec - nothing to do:
-        */
-       if (iter_is_ubuf(&dio->iter))
-               return 0;
-
-       /*
-        * We don't currently handle non-iovec iov_iters here - return an error,
-        * and we'll fall back to doing the IO synchronously:
-        */
-       if (!iter_is_iovec(&dio->iter))
-               return -1;
-
-       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
-               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
-                                   GFP_KERNEL);
-               if (unlikely(!iov))
-                       return -ENOMEM;
-
-               dio->free_iov = true;
-       }
-
-       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
-       dio->iter.__iov = iov;
-       return 0;
-}
-
-static void bch2_dio_write_flush_done(struct closure *cl)
-{
-       struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
-       struct bch_fs *c = dio->op.c;
-
-       closure_debug_destroy(cl);
-
-       dio->op.error = bch2_journal_error(&c->journal);
-
-       bch2_dio_write_done(dio);
-}
+struct nocow_flush {
+       struct closure  *cl;
+       struct bch_dev  *ca;
+       struct bio      bio;
+};
 
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
+static void nocow_flush_endio(struct bio *_bio)
 {
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_unpacked inode;
-       int ret;
 
-       dio->flush = 0;
-
-       closure_init(&dio->op.cl, NULL);
-
-       if (!dio->op.error) {
-               ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
-               if (ret) {
-                       dio->op.error = ret;
-               } else {
-                       bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
-                       bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
-               }
-       }
+       struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
 
-       if (dio->sync) {
-               closure_sync(&dio->op.cl);
-               closure_debug_destroy(&dio->op.cl);
-       } else {
-               continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
-       }
+       closure_put(bio->cl);
+       percpu_ref_put(&bio->ca->io_ref);
+       bio_put(&bio->bio);
 }
 
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct closure *cl)
 {
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       bool sync = dio->sync;
-       long ret;
-
-       if (unlikely(dio->flush)) {
-               bch2_dio_write_flush(dio);
-               if (!sync)
-                       return -EIOCBQUEUED;
-       }
-
-       bch2_pagecache_block_put(inode);
+       struct nocow_flush *bio;
+       struct bch_dev *ca;
+       struct bch_devs_mask devs;
+       unsigned dev;
 
-       if (dio->free_iov)
-               kfree(dio->iter.__iov);
+       dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX);
+       if (dev == BCH_SB_MEMBERS_MAX)
+               return;
 
-       ret = dio->op.error ?: ((long) dio->written << 9);
-       bio_put(&dio->op.wbio.bio);
+       devs = inode->ei_devs_need_flush;
+       memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
 
-       /* inode->i_dio_count is our ref on inode and thus bch_fs */
-       inode_dio_end(&inode->v);
+       for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
+               rcu_read_lock();
+               ca = rcu_dereference(c->devs[dev]);
+               if (ca && !percpu_ref_tryget(&ca->io_ref))
+                       ca = NULL;
+               rcu_read_unlock();
 
-       if (ret < 0)
-               ret = bch2_err_class(ret);
+               if (!ca)
+                       continue;
 
-       if (!sync) {
-               req->ki_complete(req, ret);
-               ret = -EIOCBQUEUED;
+               bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
+                                                   REQ_OP_FLUSH,
+                                                   GFP_KERNEL,
+                                                   &c->nocow_flush_bioset),
+                                  struct nocow_flush, bio);
+               bio->cl                 = cl;
+               bio->ca                 = ca;
+               bio->bio.bi_end_io      = nocow_flush_endio;
+               closure_bio_submit(&bio->bio, cl);
        }
-       return ret;
 }
 
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+                                        struct bch_inode_info *inode)
 {
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       req->ki_pos     += (u64) dio->op.written << 9;
-       dio->written    += dio->op.written;
-
-       if (dio->extending) {
-               spin_lock(&inode->v.i_lock);
-               if (req->ki_pos > inode->v.i_size)
-                       i_size_write(&inode->v, req->ki_pos);
-               spin_unlock(&inode->v.i_lock);
-       }
-
-       if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
-               __bch2_quota_reservation_put(c, inode, &dio->quota_res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
+       struct closure cl;
 
-       bio_release_pages(bio, false);
+       closure_init_stack(&cl);
+       bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+       closure_sync(&cl);
 
-       if (unlikely(dio->op.error))
-               set_bit(EI_INODE_ERROR, &inode->ei_flags);
+       return 0;
 }
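
The synchronous wrapper above drives the async flush with a stack closure; the pre-split dio path additionally queued a journal flush on the same closure before waiting. A minimal sketch of that combined pattern, assuming the usual bcachefs headers (example_flush_inode and the journal_seq argument are illustrative, not part of this patch):

static void example_flush_inode(struct bch_fs *c, struct bch_inode_info *inode,
                                u64 journal_seq)
{
        struct closure cl;

        closure_init_stack(&cl);

        /* queue a journal flush up to journal_seq on the closure: */
        bch2_journal_flush_seq_async(&c->journal, journal_seq, &cl);

        /* flush any devices with outstanding nocow writes for this inode: */
        bch2_inode_flush_nocow_writes_async(c, inode, &cl);

        /* wait for both to complete: */
        closure_sync(&cl);
}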
 
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct address_space *mapping = dio->mapping;
-       struct bch_inode_info *inode = dio->inode;
-       struct bch_io_opts opts;
-       struct bio *bio = &dio->op.wbio.bio;
-       unsigned unaligned, iter_count;
-       bool sync = dio->sync, dropped_locks;
-       long ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       while (1) {
-               iter_count = dio->iter.count;
-
-               EBUG_ON(current->faults_disabled_mapping);
-               current->faults_disabled_mapping = mapping;
-
-               ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
-               dropped_locks = fdm_dropped_locks();
-
-               current->faults_disabled_mapping = NULL;
-
-               /*
-                * If the fault handler returned an error but also signalled
-                * that it dropped & retook ei_pagecache_lock, we just need to
-                * re-shoot down the page cache and retry:
-                */
-               if (dropped_locks && ret)
-                       ret = 0;
-
-               if (unlikely(ret < 0))
-                       goto err;
-
-               if (unlikely(dropped_locks)) {
-                       ret = write_invalidate_inode_pages_range(mapping,
-                                       req->ki_pos,
-                                       req->ki_pos + iter_count - 1);
-                       if (unlikely(ret))
-                               goto err;
-
-                       if (!bio->bi_iter.bi_size)
-                               continue;
-               }
-
-               unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
-               bio->bi_iter.bi_size -= unaligned;
-               iov_iter_revert(&dio->iter, unaligned);
-
-               if (!bio->bi_iter.bi_size) {
-                       /*
-                        * bio_iov_iter_get_pages was only able to get <
-                        * blocksize worth of pages:
-                        */
-                       ret = -EFAULT;
-                       goto err;
-               }
-
-               bch2_write_op_init(&dio->op, c, opts);
-               dio->op.end_io          = sync
-                       ? NULL
-                       : bch2_dio_write_loop_async;
-               dio->op.target          = dio->op.opts.foreground_target;
-               dio->op.write_point     = writepoint_hashed((unsigned long) current);
-               dio->op.nr_replicas     = dio->op.opts.data_replicas;
-               dio->op.subvol          = inode->ei_subvol;
-               dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
-               dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
-               if (sync)
-                       dio->op.flags |= BCH_WRITE_SYNC;
-               dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
-
-               ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
-                                                bio_sectors(bio), true);
-               if (unlikely(ret))
-                       goto err;
-
-               ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
-                                               dio->op.opts.data_replicas, 0);
-               if (unlikely(ret) &&
-                   !bch2_dio_write_check_allocated(dio))
-                       goto err;
-
-               task_io_account_write(bio->bi_iter.bi_size);
-
-               if (unlikely(dio->iter.count) &&
-                   !dio->sync &&
-                   !dio->loop &&
-                   bch2_dio_write_copy_iov(dio))
-                       dio->sync = sync = true;
-
-               dio->loop = true;
-               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
-               if (!sync)
-                       return -EIOCBQUEUED;
-
-               bch2_dio_write_end(dio);
-
-               if (likely(!dio->iter.count) || dio->op.error)
-                       break;
-
-               bio_reset(bio, NULL, REQ_OP_WRITE);
-       }
-out:
-       return bch2_dio_write_done(dio);
-err:
-       dio->op.error = ret;
-
-       bio_release_pages(bio, false);
+/* i_size updates: */
 
-       bch2_quota_reservation_put(c, inode, &dio->quota_res);
-       goto out;
-}
+struct inode_new_size {
+       loff_t          new_size;
+       u64             now;
+       unsigned        fields;
+};
 
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+static int inode_set_size(struct bch_inode_info *inode,
+                         struct bch_inode_unpacked *bi,
+                         void *p)
 {
-       struct mm_struct *mm = dio->mm;
+       struct inode_new_size *s = p;
 
-       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+       bi->bi_size = s->new_size;
+       if (s->fields & ATTR_ATIME)
+               bi->bi_atime = s->now;
+       if (s->fields & ATTR_MTIME)
+               bi->bi_mtime = s->now;
+       if (s->fields & ATTR_CTIME)
+               bi->bi_ctime = s->now;
 
-       if (mm)
-               kthread_use_mm(mm);
-       bch2_dio_write_loop(dio);
-       if (mm)
-               kthread_unuse_mm(mm);
+       return 0;
 }
 
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      loff_t new_size, unsigned fields)
 {
-       struct dio_write *dio = container_of(op, struct dio_write, op);
-
-       bch2_dio_write_end(dio);
+       struct inode_new_size s = {
+               .new_size       = new_size,
+               .now            = bch2_current_time(c),
+               .fields         = fields,
+       };
 
-       if (likely(!dio->iter.count) || dio->op.error)
-               bch2_dio_write_done(dio);
-       else
-               bch2_dio_write_continue(dio);
+       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
 }
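
inode_set_size()/bch2_write_inode_size() above are one instance of the general bch2_write_inode() callback pattern: the callback edits the unpacked on-disk inode, and bch2_write_inode() commits the result atomically. A minimal sketch of the same pattern for a different field, assuming bch2_write_inode() keeps the signature used above (the callback and wrapper names are illustrative):

static int inode_set_flags_example(struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   void *p)
{
        unsigned *flags = p;

        /* only mutate the unpacked inode; the caller commits it: */
        bi->bi_flags |= *flags;
        return 0;
}

static int example_write_inode_flags(struct bch_fs *c,
                                     struct bch_inode_info *inode,
                                     unsigned flags)
{
        return bch2_write_inode(c, inode, inode_set_flags_example, &flags, 0);
}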
 
-static noinline
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+                          struct quota_res *quota_res, s64 sectors)
 {
-       struct file *file = req->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct dio_write *dio;
-       struct bio *bio;
-       bool locked = true, extending;
-       ssize_t ret;
-
-       prefetch(&c->opts);
-       prefetch((void *) &c->opts + 64);
-       prefetch(&inode->ei_inode);
-       prefetch((void *) &inode->ei_inode + 64);
-
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(req, iter);
-       if (unlikely(ret <= 0))
-               goto err;
-
-       ret = file_remove_privs(file);
-       if (unlikely(ret))
-               goto err;
-
-       ret = file_update_time(file);
-       if (unlikely(ret))
-               goto err;
-
-       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
-               goto err;
-
-       inode_dio_begin(&inode->v);
-       bch2_pagecache_block_get(inode);
+       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+                               inode->ei_inode.bi_sectors);
+       inode->v.i_blocks += sectors;
 
-       extending = req->ki_pos + iter->count > inode->v.i_size;
-       if (!extending) {
-               inode_unlock(&inode->v);
-               locked = false;
-       }
+#ifdef CONFIG_BCACHEFS_QUOTA
+       if (quota_res &&
+           !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+           sectors > 0) {
+               BUG_ON(sectors > quota_res->sectors);
+               BUG_ON(sectors > inode->ei_quota_reserved);
 
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_WRITE,
-                              GFP_KERNEL,
-                              &c->dio_write_bioset);
-       dio = container_of(bio, struct dio_write, op.wbio.bio);
-       dio->req                = req;
-       dio->mapping            = mapping;
-       dio->inode              = inode;
-       dio->mm                 = current->mm;
-       dio->loop               = false;
-       dio->extending          = extending;
-       dio->sync               = is_sync_kiocb(req) || extending;
-       dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
-       dio->free_iov           = false;
-       dio->quota_res.sectors  = 0;
-       dio->written            = 0;
-       dio->iter               = *iter;
-       dio->op.c               = c;
-
-       if (unlikely(mapping->nrpages)) {
-               ret = write_invalidate_inode_pages_range(mapping,
-                                               req->ki_pos,
-                                               req->ki_pos + iter->count - 1);
-               if (unlikely(ret))
-                       goto err_put_bio;
+               quota_res->sectors -= sectors;
+               inode->ei_quota_reserved -= sectors;
+       } else {
+               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
        }
-
-       ret = bch2_dio_write_loop(dio);
-err:
-       if (locked)
-               inode_unlock(&inode->v);
-       return ret;
-err_put_bio:
-       bch2_pagecache_block_put(inode);
-       bio_put(bio);
-       inode_dio_end(&inode->v);
-       goto err;
+#endif
 }
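
The double-underscore variant assumes ei_quota_lock is already held, which is how the old dio completion path paired it with releasing the quota reservation under one lock hold. A minimal sketch of that convention (the function name is illustrative):

static void example_acct_dio_sectors(struct bch_fs *c,
                                     struct bch_inode_info *inode,
                                     struct quota_res *quota_res,
                                     s64 i_sectors_delta)
{
        /*
         * Consuming a quota reservation and accounting blocks happen
         * together under ei_quota_lock:
         */
        mutex_lock(&inode->ei_quota_lock);
        __bch2_i_sectors_acct(c, inode, quota_res, i_sectors_delta);
        __bch2_quota_reservation_put(c, inode, quota_res);
        mutex_unlock(&inode->ei_quota_lock);
}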
 
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       ssize_t ret;
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               ret = bch2_direct_write(iocb, from);
-               goto out;
-       }
-
-       /* We can write back this queue in page reclaim */
-       current->backing_dev_info = inode_to_bdi(&inode->v);
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0)
-               goto unlock;
-
-       ret = file_remove_privs(file);
-       if (ret)
-               goto unlock;
-
-       ret = file_update_time(file);
-       if (ret)
-               goto unlock;
-
-       ret = bch2_buffered_write(iocb, from);
-       if (likely(ret > 0))
-               iocb->ki_pos += ret;
-unlock:
-       inode_unlock(&inode->v);
-       current->backing_dev_info = NULL;
-
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
-out:
-       return bch2_err_class(ret);
-}
 
 /* fsync: */
 
@@ -2911,10 +305,10 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
                s->s[i].nr_replicas     = 0;
 
                i_sectors_delta -= s->s[i].state == SECTOR_dirty;
-               folio_sector_set(folio, s, i, SECTOR_unallocated);
+               bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
        }
 
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
        /*
         * Caller needs to know whether this folio will be written out by
@@ -3105,7 +499,7 @@ int bch2_truncate(struct mnt_idmap *idmap,
        ret = bch2_fpunch(c, inode_inum(inode),
                        round_up(iattr->ia_size, block_bytes(c)) >> 9,
                        U64_MAX, &i_sectors_delta);
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
        bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
                                !bch2_journal_error(&c->journal), c,
@@ -3159,7 +553,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
                ret = bch2_fpunch(c, inode_inum(inode),
                                  block_start >> 9, block_end >> 9,
                                  &i_sectors_delta);
-               i_sectors_acct(c, inode, NULL, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
        }
 
        mutex_lock(&inode->ei_update_lock);
@@ -3210,7 +604,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 
        new_size = inode->v.i_size + shift;
 
-       ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+       ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
                return ret;
 
@@ -3226,7 +620,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                ret = bch2_fpunch(c, inode_inum(inode),
                                  offset >> 9, (offset + len) >> 9,
                                  &i_sectors_delta);
-               i_sectors_acct(c, inode, NULL, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
                if (ret)
                        return ret;
@@ -3410,6 +804,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                }
 
                if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+                       /*
+                        * Lock ordering - can't be holding btree locks while
+                        * blocking on a folio lock:
+                        */
                        if (bch2_clamp_data_hole(&inode->v,
                                                 &hole_start,
                                                 &hole_end,
@@ -3443,10 +841,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                if (ret)
                        goto bkey_err;
 
-               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
                drop_locks_do(&trans,
-                       (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+                       (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -3459,7 +857,7 @@ bkey_err:
 
                bch2_fpunch_at(&trans, &iter, inode_inum(inode),
                               end_sector, &i_sectors_delta);
-               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
                bch2_quota_reservation_put(c, inode, &quota_res);
        }
 
@@ -3653,7 +1051,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
        aligned_len = round_up((u64) len, block_bytes(c));
 
-       ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+       ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
                                pos_dst, pos_dst + len - 1);
        if (ret)
                goto err;
@@ -3665,7 +1063,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
        file_update_time(file_dst);
 
-       mark_pagecache_unallocated(src, pos_src >> 9,
+       bch2_mark_pagecache_unallocated(src, pos_src >> 9,
                                   (pos_src + aligned_len) >> 9);
 
        ret = bch2_remap_range(c,
@@ -3681,7 +1079,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
         */
        ret = min((u64) ret << 9, (u64) len);
 
-       i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+       bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
 
        spin_lock(&dst->v.i_lock);
        if (pos_dst + ret > dst->v.i_size)
@@ -3700,68 +1098,6 @@ err:
 
 /* fseek: */
 
-static int folio_data_offset(struct folio *folio, loff_t pos,
-                            unsigned min_replicas)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       if (s)
-               for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
-                       if (s->s[i].state >= SECTOR_dirty &&
-                           s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
-                               return i << SECTOR_SHIFT;
-
-       return -1;
-}
-
-static loff_t bch2_seek_pagecache_data(struct inode *vinode,
-                                      loff_t start_offset,
-                                      loff_t end_offset,
-                                      unsigned min_replicas,
-                                      bool nonblock)
-{
-       struct folio_batch fbatch;
-       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
-       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
-       pgoff_t index           = start_index;
-       unsigned i;
-       loff_t ret;
-       int offset;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(vinode->i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-
-                       if (!nonblock) {
-                               folio_lock(folio);
-                       } else if (!folio_trylock(folio)) {
-                               folio_batch_release(&fbatch);
-                               return -EAGAIN;
-                       }
-
-                       offset = folio_data_offset(folio,
-                                       max(folio_pos(folio), start_offset),
-                                       min_replicas);
-                       if (offset >= 0) {
-                               ret = clamp(folio_pos(folio) + offset,
-                                           start_offset, end_offset);
-                               folio_unlock(folio);
-                               folio_batch_release(&fbatch);
-                               return ret;
-                       }
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       return end_offset;
-}
-
 static loff_t bch2_seek_data(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
@@ -3815,88 +1151,6 @@ err:
        return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
 }
 
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
-                             unsigned min_replicas, bool nonblock)
-{
-       struct folio *folio;
-       struct bch_folio *s;
-       unsigned i, sectors;
-       bool ret = true;
-
-       folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
-                                   !nonblock ? FGP_LOCK : 0, 0);
-       if (IS_ERR_OR_NULL(folio))
-               return true;
-
-       if (nonblock && !folio_trylock(folio)) {
-               folio_put(folio);
-               return -EAGAIN;
-       }
-
-       s = bch2_folio(folio);
-       if (!s)
-               goto unlock;
-
-       sectors = folio_sectors(folio);
-       for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
-               if (s->s[i].state < SECTOR_dirty ||
-                   s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
-                       *offset = max(*offset,
-                                     folio_pos(folio) + (i << SECTOR_SHIFT));
-                       goto unlock;
-               }
-
-       *offset = folio_end_pos(folio);
-       ret = false;
-unlock:
-       folio_unlock(folio);
-       folio_put(folio);
-       return ret;
-}
-
-static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
-                                      loff_t start_offset,
-                                      loff_t end_offset,
-                                      unsigned min_replicas,
-                                      bool nonblock)
-{
-       struct address_space *mapping = vinode->i_mapping;
-       loff_t offset = start_offset;
-
-       while (offset < end_offset &&
-              !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
-               ;
-
-       return min(offset, end_offset);
-}
-
-static int bch2_clamp_data_hole(struct inode *inode,
-                               u64 *hole_start,
-                               u64 *hole_end,
-                               unsigned min_replicas,
-                               bool nonblock)
-{
-       loff_t ret;
-
-       ret = bch2_seek_pagecache_hole(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_start = ret;
-
-       if (*hole_start == *hole_end)
-               return 0;
-
-       ret = bch2_seek_pagecache_data(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_end = ret;
-       return 0;
-}
-
 static loff_t bch2_seek_hole(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
@@ -3981,28 +1235,10 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
 void bch2_fs_fsio_exit(struct bch_fs *c)
 {
        bioset_exit(&c->nocow_flush_bioset);
-       bioset_exit(&c->dio_write_bioset);
-       bioset_exit(&c->dio_read_bioset);
-       bioset_exit(&c->writepage_bioset);
 }
 
 int bch2_fs_fsio_init(struct bch_fs *c)
 {
-       if (bioset_init(&c->writepage_bioset,
-                       4, offsetof(struct bch_writepage_io, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
-       if (bioset_init(&c->dio_read_bioset,
-                       4, offsetof(struct dio_read, rbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
-       if (bioset_init(&c->dio_write_bioset,
-                       4, offsetof(struct dio_write, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
        if (bioset_init(&c->nocow_flush_bioset,
                        1, offsetof(struct nocow_flush, bio), 0))
                return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
index af905331542dd99191b43866ee423991764a8e66..c6704618c16ca6f70db077e6c8aa75f519863a2a 100644 (file)
 #ifndef NO_BCACHEFS_FS
 
 #include "buckets.h"
+#include "fs.h"
 #include "io_types.h"
+#include "quota.h"
 
 #include <linux/uio.h>
 
-struct quota_res;
+struct folio_vec {
+       struct folio    *fv_folio;
+       size_t          fv_offset;
+       size_t          fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{
+
+       struct folio *folio     = page_folio(bv.bv_page);
+       size_t offset           = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+               bv.bv_offset;
+       size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
+
+       return (struct folio_vec) {
+               .fv_folio       = folio,
+               .fv_offset      = offset,
+               .fv_len         = len,
+       };
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+                                                   struct bvec_iter iter)
+{
+       return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start)                    \
+       for (iter = (start);                                            \
+            (iter).bi_size &&                                          \
+               ((bvl = bio_iter_iovec_folio((bio), (iter))), 1);       \
+            bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter)                             \
+       __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
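
A minimal sketch of how this iterator is used: walk the (folio, offset, len) ranges covered by the bio's current bi_iter (the function name is illustrative):

static void example_flush_bio_folios(struct bio *bio)
{
        struct folio_vec fv;
        struct bvec_iter iter;

        /* visit each folio range the bio currently points at: */
        bio_for_each_folio(fv, bio, iter)
                flush_dcache_folio(fv.fv_folio);
}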
+
+struct quota_res {
+       u64                             sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct quota_res *res)
+{
+       BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+                       -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+       inode->ei_quota_reserved -= res->sectors;
+       res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res)
+{
+       if (res->sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_quota_reservation_put(c, inode, res);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     u64 sectors,
+                                     bool check_enospc)
+{
+       int ret;
+
+       if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+               return 0;
+
+       mutex_lock(&inode->ei_quota_lock);
+       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+                             check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+       if (likely(!ret)) {
+               inode->ei_quota_reserved += sectors;
+               res->sectors += sectors;
+       }
+       mutex_unlock(&inode->ei_quota_lock);
+
+       return ret;
+}
 
-int __must_check bch2_write_inode_size(struct bch_fs *,
-                                      struct bch_inode_info *,
-                                      loff_t, unsigned);
+#else
+
+static void __bch2_quota_reservation_put(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct quota_res *res) {}
 
-int bch2_read_folio(struct file *, struct folio *);
+static void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res) {}
 
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
+static int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     unsigned sectors,
+                                     bool check_enospc)
+{
+       return 0;
+}
 
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
-                    unsigned, struct page **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
-                  unsigned, unsigned, struct page *, void *);
+#endif
 
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+                          struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+                                      struct quota_res *quota_res, s64 sectors)
+{
+       if (sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+}
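/*
 * Sketch of the calling convention these helpers assume (reconstructed from
 * the definitions above, not taken from this commit): reserve quota before
 * dirtying pagecache, convert however much was actually written into
 * i_sectors accounting, then drop whatever is left of the reservation.
 *
 *	struct quota_res res = { 0 };
 *	int ret;
 *
 *	ret = bch2_quota_reservation_add(c, inode, &res, sectors, true);
 *	if (ret)
 *		return ret;
 *	// ... perform the write ...
 *	bch2_i_sectors_acct(c, inode, &res, sectors_written);
 *	bch2_quota_reservation_put(c, inode, &res);
 *
 * `sectors` and `sectors_written` are placeholders for whatever the caller
 * tracks.
 */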
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+       current->faults_disabled_mapping =
+               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+       return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
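/*
 * Note on the encoding above (inferred from the bit manipulation, not spelled
 * out by this commit): current->faults_disabled_mapping holds an address_space
 * pointer with bit 0 borrowed as a flag. faults_disabled_mapping() masks the
 * tag off, set_fdm_dropped_locks() sets it, and fdm_dropped_locks() tests it,
 * so a fault handler that had to drop locks can report that back to the
 * caller without any extra storage.
 */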
+
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+                       struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+                                      struct bch_inode_info *,
+                                      loff_t, unsigned);
 
 int bch2_fsync(struct file *, loff_t, loff_t, int);
 
@@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
 
 loff_t bch2_llseek(struct file *, loff_t, int);
 
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
 void bch2_fs_fsio_exit(struct bch_fs *);
 int bch2_fs_fsio_init(struct bch_fs *);
 #else
index 8d2f388b4327db8196e884866a40590371b86a5f..0b550a97097560a90c51bb498ef314f9568c9469 100644 (file)
@@ -14,6 +14,8 @@
 #include "fs-common.h"
 #include "fs-io.h"
 #include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
 #include "io.h"
@@ -203,7 +205,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 
        if (ret) {
                iget_failed(&inode->v);
-               return ERR_PTR(ret);
+               return ERR_PTR(bch2_err_class(ret));
        }
 
        mutex_lock(&c->vfs_inodes_lock);
@@ -1000,11 +1002,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret;
 
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       return bch2_readdir(c, inode_inum(inode), ctx);
+       ret = bch2_readdir(c, inode_inum(inode), ctx);
+       if (ret)
+               bch_err_fn(c, ret);
+
+       return bch2_err_class(ret);
 }
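/*
 * (Assumed rationale, consistent with the bch2_vfs_inode_get() change above:)
 * bch2_err_class() folds bcachefs-private error codes into their closest
 * standard errno before the value is handed back to the VFS, so callers never
 * see internal codes such as transaction-restart errors.
 */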
 
 static const struct file_operations bch_file_operations = {
index 0852dbe988ad1fb5bc6cdcd8f2da821c287017a1..d99c04af2c5514ea92afd750b9be84d43f0b04b5 100644 (file)
@@ -11,6 +11,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
+#include "recovery.h"
 #include "subvolume.h"
 #include "super.h"
 #include "xattr.h"
index fea21e1e5721e302504a751443835e3b91941590..e0d416553bf0b00ed2bb7eb112c9a2909afc6624 100644 (file)
@@ -348,6 +348,8 @@ int bch2_inode_peek(struct btree_trans *trans,
        return 0;
 err:
        bch2_trans_iter_exit(trans, iter);
+       if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
        return ret;
 }
 
@@ -520,23 +522,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        __bch2_inode_unpacked_to_text(out, &inode);
 }
 
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
 {
        switch (k.k->type) {
        case KEY_TYPE_inode:
-               return bkey_s_c_to_inode(k).v->bi_flags &
-                       cpu_to_le32(BCH_INODE_UNLINKED);
+               return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
        case KEY_TYPE_inode_v2:
-               return bkey_s_c_to_inode_v2(k).v->bi_flags &
-                       cpu_to_le32(BCH_INODE_UNLINKED);
+               return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
        case KEY_TYPE_inode_v3:
-               return bkey_s_c_to_inode_v3(k).v->bi_flags &
-                       cpu_to_le64(BCH_INODE_UNLINKED);
+               return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
        default:
-               return false;
+               return 0;
        }
 }
 
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{
+       return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
+}
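/*
 * With the flag word factored out, any inode flag can be tested the same way
 * regardless of key version; for example (a sketch, assuming the caller holds
 * a key of one of the inode types):
 *
 *	if (bkey_inode_flags(k) & BCH_INODE_I_SIZE_DIRTY)
 *		...
 */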
+
 int bch2_trans_mark_inode(struct btree_trans *trans,
                          enum btree_id btree_id, unsigned level,
                          struct bkey_s_c old,
index f861ae2f176a8d0d87467250272b472cdc466c94..d141c749e39bc2c5cff0315402b647809fe0c0a2 100644 (file)
@@ -14,6 +14,7 @@
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "replicas.h"
+#include "sb-clean.h"
 #include "trace.h"
 
 static struct nonce journal_nonce(const struct jset *jset)
@@ -208,33 +209,41 @@ static void journal_entry_null_range(void *start, void *end)
 #define JOURNAL_ENTRY_BAD      7
 
 static void journal_entry_err_msg(struct printbuf *out,
+                                 u32 version,
                                  struct jset *jset,
                                  struct jset_entry *entry)
 {
-       prt_str(out, "invalid journal entry ");
-       if (entry)
-               prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
-
-       if (!jset)
-               prt_printf(out, "in superblock");
-       else if (!entry)
-               prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
-       else
-               prt_printf(out, "at offset %zi/%u seq %llu",
-                          (u64 *) entry - jset->_data,
-                          le32_to_cpu(jset->u64s),
-                          le64_to_cpu(jset->seq));
+       prt_str(out, "invalid journal entry, version=");
+       bch2_version_to_text(out, version);
+
+       if (entry) {
+               prt_str(out, " type=");
+               prt_str(out, bch2_jset_entry_types[entry->type]);
+       }
+
+       if (!jset) {
+               prt_printf(out, " in superblock");
+       } else {
+               prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+               if (entry)
+                       prt_printf(out, " offset=%zi/%u",
+                                  (u64 *) entry - jset->_data,
+                                  le32_to_cpu(jset->u64s));
+       }
+
        prt_str(out, ": ");
 }
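/*
 * With the version threaded through, a typical message now reads roughly like
 * the following (illustrative only; the exact version text comes from
 * bch2_version_to_text()):
 *
 *	invalid journal entry, version=1.0 type=btree_keys seq=8192 offset=16/512: k->u64s 0
 */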
 
-#define journal_entry_err(c, jset, entry, msg, ...)                    \
+#define journal_entry_err(c, version, jset, entry, msg, ...)           \
 ({                                                                     \
        struct printbuf buf = PRINTBUF;                                 \
                                                                        \
-       journal_entry_err_msg(&buf, jset, entry);                       \
+       journal_entry_err_msg(&buf, version, jset, entry);              \
        prt_printf(&buf, msg, ##__VA_ARGS__);                           \
                                                                        \
-       switch (write) {                                                \
+       switch (flags & BKEY_INVALID_WRITE) {                           \
        case READ:                                                      \
                mustfix_fsck_err(c, "%s", buf.buf);                     \
                break;                                                  \
@@ -251,8 +260,8 @@ static void journal_entry_err_msg(struct printbuf *out,
        true;                                                           \
 })
 
-#define journal_entry_err_on(cond, c, jset, entry, msg, ...)           \
-       ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...)  \
+       ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false)
 
 #define FSCK_DELETED_KEY       5
 
@@ -261,13 +270,15 @@ static int journal_validate_key(struct bch_fs *c,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
                                struct bkey_i *k,
-                               unsigned version, int big_endian, int write)
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
+       int write = flags & BKEY_INVALID_WRITE;
        void *next = vstruct_next(entry);
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
-       if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
+       if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
@@ -275,7 +286,7 @@ static int journal_validate_key(struct bch_fs *c,
 
        if (journal_entry_err_on((void *) bkey_next(k) >
                                 (void *) vstruct_next(entry),
-                                c, jset, entry,
+                                c, version, jset, entry,
                                 "extends past end of journal entry")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
@@ -283,7 +294,7 @@ static int journal_validate_key(struct bch_fs *c,
        }
 
        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
-                                c, jset, entry,
+                                c, version, jset, entry,
                                 "bad format %u", k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -298,11 +309,7 @@ static int journal_validate_key(struct bch_fs *c,
        if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                              __btree_node_type(level, btree_id), write, &buf)) {
                printbuf_reset(&buf);
-               prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
-                          bch2_jset_entry_types[entry->type],
-                          (u64 *) entry - jset->_data,
-                          le32_to_cpu(jset->u64s),
-                          le64_to_cpu(jset->seq));
+               journal_entry_err_msg(&buf, version, jset, entry);
                prt_newline(&buf);
                printbuf_indent_add(&buf, 2);
 
@@ -312,6 +319,7 @@ static int journal_validate_key(struct bch_fs *c,
                                  __btree_node_type(level, btree_id), write, &buf);
 
                mustfix_fsck_err(c, "%s", buf.buf);
+               BUG();
 
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -330,9 +338,10 @@ fsck_err:
 }
 
 static int journal_entry_btree_keys_validate(struct bch_fs *c,
-                                            struct jset *jset,
-                                            struct jset_entry *entry,
-                                            unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct bkey_i *k = entry->start;
 
@@ -341,7 +350,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                               entry->level,
                                               entry->btree_id,
                                               k, version, big_endian,
-                                              write|BKEY_INVALID_JOURNAL);
+                                              flags|BKEY_INVALID_JOURNAL);
                if (ret == FSCK_DELETED_KEY)
                        continue;
 
@@ -369,16 +378,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_btree_root_validate(struct bch_fs *c,
-                                            struct jset *jset,
-                                            struct jset_entry *entry,
-                                            unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct bkey_i *k = entry->start;
        int ret = 0;
 
        if (journal_entry_err_on(!entry->u64s ||
                                 le16_to_cpu(entry->u64s) != k->k.u64s,
-                                c, jset, entry,
+                                c, version, jset, entry,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
@@ -392,7 +402,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
        }
 
        return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
-                                   version, big_endian, write);
+                                   version, big_endian, flags);
 fsck_err:
        return ret;
 }
@@ -404,9 +414,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        /* obsolete, don't care: */
        return 0;
@@ -418,14 +429,15 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_blacklist_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        int ret = 0;
 
        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
-                                c, jset, entry,
+                                c, version, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -443,15 +455,16 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
-                                              struct jset *jset,
-                                              struct jset_entry *entry,
-                                              unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;
 
        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
-                                c, jset, entry,
+                                c, version, jset, entry,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
@@ -461,7 +474,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
 
        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
                                 le64_to_cpu(bl_entry->end),
-                                c, jset, entry,
+                                c, version, jset, entry,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -482,9 +495,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_
 }
 
 static int journal_entry_usage_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
@@ -492,7 +506,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < sizeof(*u),
-                                c, jset, entry,
+                                c, version, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -514,9 +528,10 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_data_usage_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
@@ -525,7 +540,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
 
        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
-                                c, jset, entry,
+                                c, version, jset, entry,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -546,9 +561,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_clock_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
@@ -556,13 +572,13 @@ static int journal_entry_clock_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes != sizeof(*clock),
-                                c, jset, entry, "bad size")) {
+                                c, version, jset, entry, "bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(clock->rw > 1,
-                                c, jset, entry, "bad rw")) {
+                                c, version, jset, entry, "bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -581,9 +597,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_dev_usage_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
@@ -593,7 +610,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < expected,
-                                c, jset, entry, "bad size (%u < %u)",
+                                c, version, jset, entry, "bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -602,13 +619,13 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        dev = le32_to_cpu(u->dev);
 
        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
-                                c, jset, entry, "bad dev")) {
+                                c, version, jset, entry, "bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(u->pad,
-                                c, jset, entry, "bad pad")) {
+                                c, version, jset, entry, "bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -641,9 +658,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_log_validate(struct bch_fs *c,
-                                     struct jset *jset,
-                                     struct jset_entry *entry,
-                                     unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return 0;
 }
@@ -658,9 +676,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_overwrite_validate(struct bch_fs *c,
-                                     struct jset *jset,
-                                     struct jset_entry *entry,
-                                     unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return journal_entry_btree_keys_validate(c, jset, entry,
                                version, big_endian, READ);
@@ -674,7 +693,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
 
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
-                       struct jset_entry *, unsigned, int, int);
+                       struct jset_entry *, unsigned, int,
+                       enum bkey_invalid_flags);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
 };
 
@@ -691,11 +711,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 int bch2_journal_entry_validate(struct bch_fs *c,
                                struct jset *jset,
                                struct jset_entry *entry,
-                               unsigned version, int big_endian, int write)
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return entry->type < BCH_JSET_ENTRY_NR
                ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
-                               version, big_endian, write)
+                               version, big_endian, flags)
                : 0;
 }
 
@@ -711,22 +732,22 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
-                                int write)
+                                enum bkey_invalid_flags flags)
 {
        struct jset_entry *entry;
+       unsigned version = le32_to_cpu(jset->version);
        int ret = 0;
 
        vstruct_for_each(jset, entry) {
-               if (journal_entry_err_on(vstruct_next(entry) >
-                                        vstruct_last(jset), c, jset, entry,
+               if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+                                        c, version, jset, entry,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }
 
                ret = bch2_journal_entry_validate(c, jset, entry,
-                                       le32_to_cpu(jset->version),
-                                       JSET_BIG_ENDIAN(jset), write);
+                                       version, JSET_BIG_ENDIAN(jset), flags);
                if (ret)
                        break;
        }
@@ -737,7 +758,7 @@ fsck_err:
 static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
-                        int write)
+                        enum bkey_invalid_flags flags)
 {
        unsigned version;
        int ret = 0;
@@ -746,7 +767,8 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_NONE;
 
        version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+       if (journal_entry_err_on(!bch2_version_compatible(version),
+                       c, version, jset, NULL,
                        "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -757,7 +779,7 @@ static int jset_validate(struct bch_fs *c,
        }
 
        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
-                                c, jset, NULL,
+                                c, version, jset, NULL,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -767,7 +789,7 @@ static int jset_validate(struct bch_fs *c,
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
-                                c, jset, NULL,
+                                c, version, jset, NULL,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
@@ -775,7 +797,7 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_BAD;
        }
 
-       ret = jset_validate_entries(c, jset, write);
+       ret = jset_validate_entries(c, jset, flags);
 fsck_err:
        return ret;
 }
@@ -788,14 +810,15 @@ static int jset_validate_early(struct bch_fs *c,
 {
        size_t bytes = vstruct_bytes(jset);
        unsigned version;
-       int write = READ;
+       enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
        int ret = 0;
 
        if (le64_to_cpu(jset->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;
 
        version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+       if (journal_entry_err_on(!bch2_version_compatible(version),
+                        c, version, jset, NULL,
                        "%s sector %llu seq %llu: unknown journal entry version %u.%u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -810,7 +833,7 @@ static int jset_validate_early(struct bch_fs *c,
                return JOURNAL_ENTRY_REREAD;
 
        if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
-                                c, jset, NULL,
+                        c, version, jset, NULL,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq), bytes))
@@ -1127,7 +1150,7 @@ int bch2_journal_read(struct bch_fs *c,
         * those entries will be blacklisted:
         */
        genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
-               int write = READ;
+               enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
 
                i = *_i;
 
@@ -1149,7 +1172,7 @@ int bch2_journal_read(struct bch_fs *c,
                }
 
                if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
-                                        c, &i->j, NULL,
+                                        c, le32_to_cpu(i->j.version), &i->j, NULL,
                                         "invalid journal entry: last_seq > seq (%llu > %llu)",
                                         le64_to_cpu(i->j.last_seq),
                                         le64_to_cpu(i->j.seq)))
index 8801e98104bd8aaa9671b41f89ee4e904169388d..a88d097b13f1294a5ca1f3c30ebba5282ef56da3 100644 (file)
@@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
                jset_entry_for_each_key(entry, k)
 
 int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
-                               struct jset_entry *, unsigned, int, int);
+                               struct jset_entry *, unsigned, int,
+                               enum bkey_invalid_flags);
 void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
                                struct jset_entry *);
 
index 8de83e10375187803730214e3bee4c3cd425256c..7a3139b578abc033ef0e0a34fb4a4600551308cd 100644 (file)
@@ -3,13 +3,14 @@
 #include "bcachefs.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
+#include "buckets.h"
 #include "errcode.h"
 #include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
-#include "super.h"
+#include "sb-members.h"
 #include "trace.h"
 
 #include <linux/kthread.h>
index 5242f20bb680fc2c0d9fc8ebd54188db87dfa8ab..256431a6dc0caf502b11808877b625848048ad75 100644 (file)
@@ -220,8 +220,10 @@ static int bch2_copygc(struct btree_trans *trans,
 
                f = move_bucket_in_flight_add(buckets_in_flight, *i);
                ret = PTR_ERR_OR_ZERO(f);
-               if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
+               if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+                       ret = 0;
                        continue;
+               }
                if (ret == -ENOMEM) { /* flush IO, continue later */
                        ret = 0;
                        break;
index 55a233c2c7cc7b7f8122b66c3cd32fc057658c8d..cb2186a7d64e03f785421c137100c940b35c48e7 100644 (file)
@@ -5,6 +5,7 @@
 #include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -23,6 +24,7 @@
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "sb-clean.h"
 #include "subvolume.h"
 #include "super-io.h"
 
@@ -57,524 +59,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
                        bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
 }
 
-/* iterate over keys read from the journal: */
-
-static int __journal_key_cmp(enum btree_id     l_btree_id,
-                            unsigned           l_level,
-                            struct bpos        l_pos,
-                            const struct journal_key *r)
-{
-       return (cmp_int(l_btree_id,     r->btree_id) ?:
-               cmp_int(l_level,        r->level) ?:
-               bpos_cmp(l_pos, r->k->k.p));
-}
-
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
-       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
-       size_t gap_size = keys->size - keys->nr;
-
-       if (idx >= keys->gap)
-               idx += gap_size;
-       return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
-       return keys->d + idx_to_pos(keys, idx);
-}
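/*
 * Layout these helpers rely on (illustration, reconstructed from idx_to_pos()):
 * keys->d stores keys->nr sorted keys in keys->size slots, with the unused
 * keys->size - keys->nr slots forming a single gap starting at keys->gap. A
 * logical index is mapped to an array slot by skipping the gap, e.g. with
 * nr=6, size=8, gap=4:
 *
 *	slots:    k0 k1 k2 k3 __ __ k4 k5
 *	logical:   0  1  2  3        4  5
 */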
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
-                                       enum btree_id id, unsigned level,
-                                       struct bpos pos)
-{
-       size_t l = 0, r = keys->nr, m;
-
-       while (l < r) {
-               m = l + ((r - l) >> 1);
-               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
-                       l = m + 1;
-               else
-                       r = m;
-       }
-
-       BUG_ON(l < keys->nr &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
-       BUG_ON(l &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
-       return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
-                                     enum btree_id id, unsigned level,
-                                     struct bpos pos)
-{
-       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos,
-                                          struct bpos end_pos, size_t *idx)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       unsigned iters = 0;
-       struct journal_key *k;
-search:
-       if (!*idx)
-               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
-       while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
-               if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
-                       return NULL;
-
-               if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
-                   !k->overwritten)
-                       return k->k;
-
-               (*idx)++;
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       goto search;
-               }
-       }
-
-       return NULL;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos)
-{
-       size_t idx = 0;
-
-       return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       /* The key we just inserted is immediately before the gap: */
-       size_t gap_end = keys->gap + (keys->size - keys->nr);
-       struct btree_and_journal_iter *iter;
-
-       /*
-        * If an iterator points one after the key we just inserted, decrement
-        * the iterator so it points at the key we just inserted - if the
-        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
-        * handle that:
-        */
-       list_for_each_entry(iter, &c->journal_iters, journal.list)
-               if (iter->journal.idx == gap_end)
-                       iter->journal.idx = keys->gap - 1;
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       struct journal_iter *iter;
-       size_t gap_size = keys->size - keys->nr;
-
-       list_for_each_entry(iter, &c->journal_iters, list) {
-               if (iter->idx > old_gap)
-                       iter->idx -= gap_size;
-               if (iter->idx >= new_gap)
-                       iter->idx += gap_size;
-       }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
-                                unsigned level, struct bkey_i *k)
-{
-       struct journal_key n = {
-               .btree_id       = id,
-               .level          = level,
-               .k              = k,
-               .allocated      = true,
-               /*
-                * Ensure these keys are done last by journal replay, to unblock
-                * journal reclaim:
-                */
-               .journal_seq    = U32_MAX,
-       };
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
-       BUG_ON(test_bit(BCH_FS_RW, &c->flags));
-
-       if (idx < keys->size &&
-           journal_key_cmp(&n, &keys->d[idx]) == 0) {
-               if (keys->d[idx].allocated)
-                       kfree(keys->d[idx].k);
-               keys->d[idx] = n;
-               return 0;
-       }
-
-       if (idx > keys->gap)
-               idx -= keys->size - keys->nr;
-
-       if (keys->nr == keys->size) {
-               struct journal_keys new_keys = {
-                       .nr                     = keys->nr,
-                       .size                   = max_t(size_t, keys->size, 8) * 2,
-               };
-
-               new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
-               if (!new_keys.d) {
-                       bch_err(c, "%s: error allocating new key array (size %zu)",
-                               __func__, new_keys.size);
-                       return -BCH_ERR_ENOMEM_journal_key_insert;
-               }
-
-               /* Since @keys was full, there was no gap: */
-               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
-               kvfree(keys->d);
-               *keys = new_keys;
-
-               /* And now the gap is at the end: */
-               keys->gap = keys->nr;
-       }
-
-       journal_iters_move_gap(c, keys->gap, idx);
-
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
-       keys->gap = idx;
-
-       keys->nr++;
-       keys->d[keys->gap++] = n;
-
-       journal_iters_fix(c);
-
-       return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bkey_i *k)
-{
-       struct bkey_i *n;
-       int ret;
-
-       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
-       if (!n)
-               return -BCH_ERR_ENOMEM_journal_key_insert;
-
-       bkey_copy(n, k);
-       ret = bch2_journal_key_insert_take(c, id, level, n);
-       if (ret)
-               kfree(n);
-       return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bpos pos)
-{
-       struct bkey_i whiteout;
-
-       bkey_init(&whiteout.k);
-       whiteout.k.p = pos;
-
-       return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
-                                 unsigned level, struct bpos pos)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-       if (idx < keys->size &&
-           keys->d[idx].btree_id       == btree &&
-           keys->d[idx].level          == level &&
-           bpos_eq(keys->d[idx].k->k.p, pos))
-               keys->d[idx].overwritten = true;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-       if (iter->idx < iter->keys->size) {
-               iter->idx++;
-               if (iter->idx == iter->keys->gap)
-                       iter->idx += iter->keys->size - iter->keys->nr;
-       }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
-       struct journal_key *k = iter->keys->d + iter->idx;
-
-       while (k < iter->keys->d + iter->keys->size &&
-              k->btree_id      == iter->btree_id &&
-              k->level         == iter->level) {
-               if (!k->overwritten)
-                       return bkey_i_to_s_c(k->k);
-
-               bch2_journal_iter_advance(iter);
-               k = iter->keys->d + iter->idx;
-       }
-
-       return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
-       list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
-                                  struct journal_iter *iter,
-                                  enum btree_id id, unsigned level,
-                                  struct bpos pos)
-{
-       iter->btree_id  = id;
-       iter->level     = level;
-       iter->keys      = &c->journal_keys;
-       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
-       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-                                               iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
-       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
-       if (bpos_eq(iter->pos, SPOS_MAX))
-               iter->at_end = true;
-       else
-               iter->pos = bpos_successor(iter->pos);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
-       struct bkey_s_c btree_k, journal_k, ret;
-again:
-       if (iter->at_end)
-               return bkey_s_c_null;
-
-       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
-              bpos_lt(btree_k.k->p, iter->pos))
-               bch2_journal_iter_advance_btree(iter);
-
-       while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-              bpos_lt(journal_k.k->p, iter->pos))
-               bch2_journal_iter_advance(&iter->journal);
-
-       ret = journal_k.k &&
-               (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
-               ? journal_k
-               : btree_k;
-
-       if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
-               ret = bkey_s_c_null;
-
-       if (ret.k) {
-               iter->pos = ret.k->p;
-               if (bkey_deleted(ret.k)) {
-                       bch2_btree_and_journal_iter_advance(iter);
-                       goto again;
-               }
-       } else {
-               iter->pos = SPOS_MAX;
-               iter->at_end = true;
-       }
-
-       return ret;
-}
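/*
 * Merge semantics (as implemented above): whichever of the btree key and the
 * journal key sorts first is returned, with the journal key winning ties, so
 * an entry read from the journal shadows the on-disk btree key at the same
 * position; deleted (whiteout) keys simply advance the iterator.
 */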
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
-       bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                                 struct bch_fs *c,
-                                                 struct btree *b,
-                                                 struct btree_node_iter node_iter,
-                                                 struct bpos pos)
-{
-       memset(iter, 0, sizeof(*iter));
-
-       iter->b = b;
-       iter->node_iter = node_iter;
-       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
-       INIT_LIST_HEAD(&iter->journal.list);
-       iter->pos = b->data->min_key;
-       iter->at_end = false;
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct bch_fs *c,
-                                               struct btree *b)
-{
-       struct btree_node_iter node_iter;
-
-       bch2_btree_node_iter_init_from_start(&node_iter, b);
-       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
-       list_add(&iter->journal.list, &c->journal_iters);
-}
-
-/* sort and dedup all keys in the journal: */
-
-void bch2_journal_entries_free(struct bch_fs *c)
-{
-       struct journal_replay **i;
-       struct genradix_iter iter;
-
-       genradix_for_each(&c->journal_entries, iter, i)
-               if (*i)
-                       kvpfree(*i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&(*i)->j));
-       genradix_free(&c->journal_entries);
-}
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
-       const struct journal_key *l = _l;
-       const struct journal_key *r = _r;
-
-       return  journal_key_cmp(l, r) ?:
-               cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->journal_offset, r->journal_offset);
-}
-
-void bch2_journal_keys_free(struct journal_keys *keys)
-{
-       struct journal_key *i;
-
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-       keys->gap = keys->nr;
-
-       for (i = keys->d; i < keys->d + keys->nr; i++)
-               if (i->allocated)
-                       kfree(i->k);
-
-       kvfree(keys->d);
-       keys->d = NULL;
-       keys->nr = keys->gap = keys->size = 0;
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
-       struct journal_key *src, *dst;
-
-       sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
-
-       src = dst = keys->d;
-       while (src < keys->d + keys->nr) {
-               while (src + 1 < keys->d + keys->nr &&
-                      src[0].btree_id  == src[1].btree_id &&
-                      src[0].level     == src[1].level &&
-                      bpos_eq(src[0].k->k.p, src[1].k->k.p))
-                       src++;
-
-               *dst++ = *src++;
-       }
-
-       keys->nr = dst - keys->d;
-}
-
-static int journal_keys_sort(struct bch_fs *c)
-{
-       struct genradix_iter iter;
-       struct journal_replay *i, **_i;
-       struct jset_entry *entry;
-       struct bkey_i *k;
-       struct journal_keys *keys = &c->journal_keys;
-       size_t nr_keys = 0, nr_read = 0;
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (!i || i->ignore)
-                       continue;
-
-               for_each_jset_key(k, entry, &i->j)
-                       nr_keys++;
-       }
-
-       if (!nr_keys)
-               return 0;
-
-       keys->size = roundup_pow_of_two(nr_keys);
-
-       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-       if (!keys->d) {
-               bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
-                       nr_keys);
-
-               do {
-                       keys->size >>= 1;
-                       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-               } while (!keys->d && keys->size > nr_keys / 8);
-
-               if (!keys->d) {
-                       bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
-                               keys->size);
-                       return -BCH_ERR_ENOMEM_journal_keys_sort;
-               }
-       }
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (!i || i->ignore)
-                       continue;
-
-               cond_resched();
-
-               for_each_jset_key(k, entry, &i->j) {
-                       if (keys->nr == keys->size) {
-                               __journal_keys_sort(keys);
-
-                               if (keys->nr > keys->size * 7 / 8) {
-                                       bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
-                                               keys->nr, keys->size, nr_read, nr_keys);
-                                       return -BCH_ERR_ENOMEM_journal_keys_sort;
-                               }
-                       }
-
-                       keys->d[keys->nr++] = (struct journal_key) {
-                               .btree_id       = entry->btree_id,
-                               .level          = entry->level,
-                               .k              = k,
-                               .journal_seq    = le64_to_cpu(i->j.seq),
-                               .journal_offset = k->_data - i->j._data,
-                       };
-
-                       nr_read++;
-               }
-       }
-
-       __journal_keys_sort(keys);
-       keys->gap = keys->nr;
-
-       bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
-       return 0;
-}
-
 /* journal replay: */
 
 static void replay_now_at(struct journal *j, u64 seq)
@@ -846,134 +330,6 @@ static int journal_replay_early(struct bch_fs *c,
 
 /* sb clean section: */
 
-static struct bkey_i *btree_root_find(struct bch_fs *c,
-                                     struct bch_sb_field_clean *clean,
-                                     struct jset *j,
-                                     enum btree_id id, unsigned *level)
-{
-       struct bkey_i *k;
-       struct jset_entry *entry, *start, *end;
-
-       if (clean) {
-               start = clean->start;
-               end = vstruct_end(&clean->field);
-       } else {
-               start = j->start;
-               end = vstruct_last(j);
-       }
-
-       for (entry = start; entry < end; entry = vstruct_next(entry))
-               if (entry->type == BCH_JSET_ENTRY_btree_root &&
-                   entry->btree_id == id)
-                       goto found;
-
-       return NULL;
-found:
-       if (!entry->u64s)
-               return ERR_PTR(-EINVAL);
-
-       k = entry->start;
-       *level = entry->level;
-       return k;
-}
-
-static int verify_superblock_clean(struct bch_fs *c,
-                                  struct bch_sb_field_clean **cleanp,
-                                  struct jset *j)
-{
-       unsigned i;
-       struct bch_sb_field_clean *clean = *cleanp;
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-       int ret = 0;
-
-       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
-                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
-                       le64_to_cpu(clean->journal_seq),
-                       le64_to_cpu(j->seq))) {
-               kfree(clean);
-               *cleanp = NULL;
-               return 0;
-       }
-
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               struct bkey_i *k1, *k2;
-               unsigned l1 = 0, l2 = 0;
-
-               k1 = btree_root_find(c, clean, NULL, i, &l1);
-               k2 = btree_root_find(c, NULL, j, i, &l2);
-
-               if (!k1 && !k2)
-                       continue;
-
-               printbuf_reset(&buf1);
-               printbuf_reset(&buf2);
-
-               if (k1)
-                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
-               else
-                       prt_printf(&buf1, "(none)");
-
-               if (k2)
-                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
-               else
-                       prt_printf(&buf2, "(none)");
-
-               mustfix_fsck_err_on(!k1 || !k2 ||
-                                   IS_ERR(k1) ||
-                                   IS_ERR(k2) ||
-                                   k1->k.u64s != k2->k.u64s ||
-                                   memcmp(k1, k2, bkey_bytes(&k1->k)) ||
-                                   l1 != l2, c,
-                       "superblock btree root %u doesn't match journal after clean shutdown\n"
-                       "sb:      l=%u %s\n"
-                       "journal: l=%u %s\n", i,
-                       l1, buf1.buf,
-                       l2, buf2.buf);
-       }
-fsck_err:
-       printbuf_exit(&buf2);
-       printbuf_exit(&buf1);
-       return ret;
-}
-
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *clean, *sb_clean;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
-       if (fsck_err_on(!sb_clean, c,
-                       "superblock marked clean but clean section not present")) {
-               SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-               c->sb.clean = false;
-               mutex_unlock(&c->sb_lock);
-               return NULL;
-       }
-
-       clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
-                       GFP_KERNEL);
-       if (!clean) {
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
-       }
-
-       ret = bch2_sb_clean_validate_late(c, clean, READ);
-       if (ret) {
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(ret);
-       }
-
-       mutex_unlock(&c->sb_lock);
-
-       return clean;
-fsck_err:
-       mutex_unlock(&c->sb_lock);
-       return ERR_PTR(ret);
-}
-
 static bool btree_id_is_alloc(enum btree_id id)
 {
        switch (id) {
@@ -1120,6 +476,35 @@ static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
        return ret;
 }
 
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, _when)  #_fn,
+       BCH_RECOVERY_PASSES()
+#undef x
+       NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+       return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+       return 0;
+}
+
+struct recovery_pass_fn {
+       int             (*fn)(struct bch_fs *);
+       unsigned        when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _when)  { .fn = bch2_##_fn, .when = _when },
+       BCH_RECOVERY_PASSES()
+#undef x
+};
+
 static void check_version_upgrade(struct bch_fs *c)
 {
        unsigned latest_compatible = bch2_version_compatible(c->sb.version);
@@ -1172,7 +557,12 @@ static void check_version_upgrade(struct bch_fs *c)
 
                recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
                if (recovery_passes) {
-                       prt_str(&buf, "fsck required");
+                       if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+                               prt_str(&buf, "fsck required");
+                       else {
+                               prt_str(&buf, "running recovery passes: ");
+                               prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+                       }
 
                        c->recovery_passes_explicit |= recovery_passes;
                        c->opts.fix_errors = FSCK_FIX_yes;
@@ -1188,42 +578,19 @@ static void check_version_upgrade(struct bch_fs *c)
        }
 }
 
-static int bch2_check_allocations(struct bch_fs *c)
-{
-       return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
-       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
-       return 0;
-}
-
-struct recovery_pass_fn {
-       int             (*fn)(struct bch_fs *);
-       const char      *name;
-       unsigned        when;
-};
-
-static struct recovery_pass_fn recovery_passes[] = {
-#define x(_fn, _when)  { .fn = bch2_##_fn, .name = #_fn, .when = _when },
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
 u64 bch2_fsck_recovery_passes(void)
 {
        u64 ret = 0;
 
-       for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
-               if (recovery_passes[i].when & PASS_FSCK)
+       for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+               if (recovery_pass_fns[i].when & PASS_FSCK)
                        ret |= BIT_ULL(i);
        return ret;
 }
 
 static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
 {
-       struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass;
+       struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
 
        if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
                return false;
@@ -1245,15 +612,18 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
        c->curr_recovery_pass = pass;
 
        if (should_run_recovery_pass(c, pass)) {
-               struct recovery_pass_fn *p = recovery_passes + pass;
+               struct recovery_pass_fn *p = recovery_pass_fns + pass;
 
                if (!(p->when & PASS_SILENT))
-                       printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name);
+                       printk(KERN_INFO bch2_log_msg(c, "%s..."),
+                              bch2_recovery_passes[pass]);
                ret = p->fn(c);
                if (ret)
                        return ret;
                if (!(p->when & PASS_SILENT))
                        printk(KERN_CONT " done\n");
+
+               c->recovery_passes_complete |= BIT_ULL(pass);
        }
 
        return 0;
@@ -1263,7 +633,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c)
 {
        int ret = 0;
 
-       while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) {
+       while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
                ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
                if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
                        continue;
@@ -1283,17 +653,17 @@ int bch2_fs_recovery(struct bch_fs *c)
        bool write_sb = false;
        int ret = 0;
 
-       if (c->sb.clean)
-               clean = read_superblock_clean(c);
-       ret = PTR_ERR_OR_ZERO(clean);
-       if (ret)
-               goto err;
+       if (c->sb.clean) {
+               clean = bch2_read_superblock_clean(c);
+               ret = PTR_ERR_OR_ZERO(clean);
+               if (ret)
+                       goto err;
 
-       if (c->sb.clean)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));
-       else
+       } else {
                bch_info(c, "recovering from unclean shutdown");
+       }
 
        if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
                bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
@@ -1308,12 +678,6 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        }
 
-       if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
-               bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
-               ret = -EINVAL;
-               goto err;
-       }
-
        if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery))
                check_version_upgrade(c);
 
@@ -1373,12 +737,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                                }
                }
 
-               ret = journal_keys_sort(c);
+               ret = bch2_journal_keys_sort(c);
                if (ret)
                        goto err;
 
                if (c->sb.clean && last_journal_entry) {
-                       ret = verify_superblock_clean(c, &clean,
+                       ret = bch2_verify_superblock_clean(c, &clean,
                                                      last_journal_entry);
                        if (ret)
                                goto err;
@@ -1513,7 +877,6 @@ use_clean:
        mutex_unlock(&c->sb_lock);
 
        if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
            c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
                struct bch_move_stats stats;
 
@@ -1581,7 +944,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        }
        mutex_unlock(&c->sb_lock);
 
-       c->curr_recovery_pass = ARRAY_SIZE(recovery_passes);
+       c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
        set_bit(BCH_FS_MAY_GO_RW, &c->flags);
        set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
index f8e796c0f8c86880f6f9b63017f4c0ef4f879ecc..852d30567da9c4079c2f42a71b00e2d5de2c03e0 100644 (file)
@@ -2,55 +2,28 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
-struct journal_iter {
-       struct list_head        list;
-       enum btree_id           btree_id;
-       unsigned                level;
-       size_t                  idx;
-       struct journal_keys     *keys;
-};
+extern const char * const bch2_recovery_passes[];
 
 /*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ * For when we need to rewind recovery passes and run a pass we skipped:
  */
-
-struct btree_and_journal_iter {
-       struct btree            *b;
-       struct btree_node_iter  node_iter;
-       struct bkey             unpacked;
-
-       struct journal_iter     journal;
-       struct bpos             pos;
-       bool                    at_end;
-};
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
-                               unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
-                                          unsigned, struct bpos);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
-                                unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
-                           unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
-                           unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
-                                 unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                               struct bch_fs *, struct btree *,
-                               struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                                               struct bch_fs *,
-                                               struct btree *);
-
-void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct bch_fs *);
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+                                                 enum bch_recovery_pass pass)
+{
+       bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+                bch2_recovery_passes[pass], pass,
+                bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+       c->recovery_passes_explicit |= BIT_ULL(pass);
+
+       if (c->curr_recovery_pass >= pass) {
+               c->curr_recovery_pass = pass;
+               c->recovery_passes_complete &= (1ULL << pass) - 1;
+               return -BCH_ERR_restart_recovery;
+       } else {
+               return 0;
+       }
+}
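+
+/*
+ * Usage sketch (hypothetical caller, for illustration): a pass that finds it
+ * needs an earlier pass re-run can do
+ *
+ *     if (found_inconsistency)
+ *             return bch2_run_explicit_recovery_pass(c,
+ *                             BCH_RECOVERY_PASS_check_allocations);
+ *
+ * where found_inconsistency stands in for whatever condition the caller
+ * detects.  If the requested pass is at or before the current one this
+ * returns -BCH_ERR_restart_recovery, which bch2_run_recovery_passes()
+ * treats as "rewind and continue"; otherwise the pass is only flagged to run
+ * when recovery reaches it.
+ */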
 
 u64 bch2_fsck_recovery_passes(void);
 
diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c
new file mode 100644 (file)
index 0000000..a3695e5
--- /dev/null
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
+
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+                               int write)
+{
+       struct jset_entry *entry;
+       int ret;
+
+       for (entry = clean->start;
+            entry < (struct jset_entry *) vstruct_end(&clean->field);
+            entry = vstruct_next(entry)) {
+               ret = bch2_journal_entry_validate(c, NULL, entry,
+                                                 le16_to_cpu(c->disk_sb.sb->version),
+                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+                                                 write);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+                                     struct bch_sb_field_clean *clean,
+                                     struct jset *j,
+                                     enum btree_id id, unsigned *level)
+{
+       struct bkey_i *k;
+       struct jset_entry *entry, *start, *end;
+
+       if (clean) {
+               start = clean->start;
+               end = vstruct_end(&clean->field);
+       } else {
+               start = j->start;
+               end = vstruct_last(j);
+       }
+
+       for (entry = start; entry < end; entry = vstruct_next(entry))
+               if (entry->type == BCH_JSET_ENTRY_btree_root &&
+                   entry->btree_id == id)
+                       goto found;
+
+       return NULL;
+found:
+       if (!entry->u64s)
+               return ERR_PTR(-EINVAL);
+
+       k = entry->start;
+       *level = entry->level;
+       return k;
+}
+
+int bch2_verify_superblock_clean(struct bch_fs *c,
+                                struct bch_sb_field_clean **cleanp,
+                                struct jset *j)
+{
+       unsigned i;
+       struct bch_sb_field_clean *clean = *cleanp;
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       int ret = 0;
+
+       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+                       le64_to_cpu(clean->journal_seq),
+                       le64_to_cpu(j->seq))) {
+               kfree(clean);
+               *cleanp = NULL;
+               return 0;
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               struct bkey_i *k1, *k2;
+               unsigned l1 = 0, l2 = 0;
+
+               k1 = btree_root_find(c, clean, NULL, i, &l1);
+               k2 = btree_root_find(c, NULL, j, i, &l2);
+
+               if (!k1 && !k2)
+                       continue;
+
+               printbuf_reset(&buf1);
+               printbuf_reset(&buf2);
+
+               if (k1)
+                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+               else
+                       prt_printf(&buf1, "(none)");
+
+               if (k2)
+                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+               else
+                       prt_printf(&buf2, "(none)");
+
+               mustfix_fsck_err_on(!k1 || !k2 ||
+                                   IS_ERR(k1) ||
+                                   IS_ERR(k2) ||
+                                   k1->k.u64s != k2->k.u64s ||
+                                   memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+                                   l1 != l2, c,
+                       "superblock btree root %u doesn't match journal after clean shutdown\n"
+                       "sb:      l=%u %s\n"
+                       "journal: l=%u %s\n", i,
+                       l1, buf1.buf,
+                       l2, buf2.buf);
+       }
+fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
+       return ret;
+}
+
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+       struct bch_sb_field_clean *clean, *sb_clean;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+
+       if (fsck_err_on(!sb_clean, c,
+                       "superblock marked clean but clean section not present")) {
+               SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+               c->sb.clean = false;
+               mutex_unlock(&c->sb_lock);
+               return NULL;
+       }
+
+       clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+                       GFP_KERNEL);
+       if (!clean) {
+               mutex_unlock(&c->sb_lock);
+               return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
+       }
+
+       ret = bch2_sb_clean_validate_late(c, clean, READ);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ERR_PTR(ret);
+       }
+
+       mutex_unlock(&c->sb_lock);
+
+       return clean;
+fsck_err:
+       mutex_unlock(&c->sb_lock);
+       return ERR_PTR(ret);
+}
+
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+       memset(entry, 0, u64s * sizeof(u64));
+       /*
+        * The u64s field counts from the start of data, ignoring the shared
+        * fields.
+        */
+       entry->u64s = cpu_to_le16(u64s - 1);
+
+       *end = vstruct_next(*end);
+       return entry;
+}
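+
+/*
+ * Worked example (illustrative, assuming the 8-byte struct jset_entry
+ * header): for a 16-byte struct jset_entry_usage, u64s rounds up to 2, both
+ * u64s are zeroed, and entry->u64s is stored as 1, since the header word is
+ * not counted, per the comment above.
+ */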
+
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
+{
+       struct bch_dev *ca;
+       unsigned i, dev;
+
+       percpu_down_read(&c->mark_lock);
+
+       if (!journal_seq) {
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       bch2_fs_usage_acc_to_base(c, i);
+       } else {
+               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
+       }
+
+       {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_inodes;
+               u->v            = cpu_to_le64(c->usage_base->nr_inodes);
+       }
+
+       {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_key_version;
+               u->v            = cpu_to_le64(atomic64_read(&c->key_version));
+       }
+
+       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_reserved;
+               u->entry.level  = i;
+               u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
+       }
+
+       for (i = 0; i < c->replicas.nr; i++) {
+               struct bch_replicas_entry *e =
+                       cpu_replicas_entry(&c->replicas, i);
+               struct jset_entry_data_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_data_usage;
+               u->v            = cpu_to_le64(c->usage_base->replicas[i]);
+               unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+                             "embedded variable length struct");
+       }
+
+       for_each_member_device(ca, c, dev) {
+               unsigned b = sizeof(struct jset_entry_dev_usage) +
+                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+               struct jset_entry_dev_usage *u =
+                       container_of(jset_entry_init(end, b),
+                                    struct jset_entry_dev_usage, entry);
+
+               u->entry.type = BCH_JSET_ENTRY_dev_usage;
+               u->dev = cpu_to_le32(dev);
+               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+               }
+       }
+
+       percpu_up_read(&c->mark_lock);
+
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+       }
+}
+
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+               prt_printf(err, "wrong size (got %zu should be %zu)",
+                      vstruct_bytes(&clean->field), sizeof(*clean));
+               return -BCH_ERR_invalid_sb_clean;
+       }
+
+       return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+       struct jset_entry *entry;
+
+       prt_printf(out, "flags:          %x",   le32_to_cpu(clean->flags));
+       prt_newline(out);
+       prt_printf(out, "journal_seq:    %llu", le64_to_cpu(clean->journal_seq));
+       prt_newline(out);
+
+       for (entry = clean->start;
+            entry != vstruct_end(&clean->field);
+            entry = vstruct_next(entry)) {
+               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+                   !entry->u64s)
+                       continue;
+
+               bch2_journal_entry_to_text(out, NULL, entry);
+               prt_newline(out);
+       }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+       .validate       = bch2_sb_clean_validate,
+       .to_text        = bch2_sb_clean_to_text,
+};
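+
+/*
+ * For illustration: this ops struct is referenced from the bch2_sb_field_ops[]
+ * table in super-io.c ([BCH_SB_FIELD_clean] = &bch_sb_field_ops_clean via its
+ * x-macro), so the validate/to_text hooks above run whenever the clean
+ * section is validated or printed.
+ */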
+
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+       int ret;
+
+       /*
+        * Unconditionally write superblock, to verify it hasn't changed before
+        * we go rw:
+        */
+
+       mutex_lock(&c->sb_lock);
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+       bch2_sb_maybe_downgrade(c);
+       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+       ret = bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+       struct bch_sb_field_clean *sb_clean;
+       struct jset_entry *entry;
+       unsigned u64s;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       if (BCH_SB_CLEAN(c->disk_sb.sb))
+               goto out;
+
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+       u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+       sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+       if (!sb_clean) {
+               bch_err(c, "error resizing superblock while setting filesystem clean");
+               goto out;
+       }
+
+       sb_clean->flags         = 0;
+       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
+
+       /* Trying to catch outstanding bug: */
+       BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+       entry = sb_clean->start;
+       bch2_journal_super_entries_add_common(c, &entry, 0);
+       entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+       memset(entry, 0,
+              vstruct_end(&sb_clean->field) - (void *) entry);
+
+       /*
+        * this should be in the write path, and we should be validating every
+        * superblock section:
+        */
+       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+       if (ret) {
+               bch_err(c, "error writing marking filesystem clean: validate error");
+               goto out;
+       }
+
+       bch2_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+}
diff --git a/libbcachefs/sb-clean.h b/libbcachefs/sb-clean.h
new file mode 100644 (file)
index 0000000..71caef2
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+                                struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
new file mode 100644 (file)
index 0000000..16a2b33
--- /dev/null
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+/* Code for bch_sb_field_members: */
+
+static int bch2_sb_members_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_members *mi = field_to_type(f, members);
+       unsigned i;
+
+       if ((void *) (mi->members + sb->nr_devices) >
+           vstruct_end(&mi->field)) {
+               prt_printf(err, "too many devices for section size");
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
+                       prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
+                              i, le64_to_cpu(m->nbuckets), LONG_MAX);
+                       return -BCH_ERR_invalid_sb_members;
+               }
+
+               if (le64_to_cpu(m->nbuckets) -
+                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
+                       prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
+                              i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
+                       return -BCH_ERR_invalid_sb_members;
+               }
+
+               if (le16_to_cpu(m->bucket_size) <
+                   le16_to_cpu(sb->block_size)) {
+                       prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+                              i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
+                       return -BCH_ERR_invalid_sb_members;
+               }
+
+               if (le16_to_cpu(m->bucket_size) <
+                   BCH_SB_BTREE_NODE_SIZE(sb)) {
+                       prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+                              i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+                       return -BCH_ERR_invalid_sb_members;
+               }
+       }
+
+       return 0;
+}
+
+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_members *mi = field_to_type(f, members);
+       struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
+       unsigned i;
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+               unsigned data_have = bch2_sb_dev_has_data(sb, i);
+               u64 bucket_size = le16_to_cpu(m->bucket_size);
+               u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               prt_printf(out, "Device:");
+               prt_tab(out);
+               prt_printf(out, "%u", i);
+               prt_newline(out);
+
+               printbuf_indent_add(out, 2);
+
+               prt_printf(out, "UUID:");
+               prt_tab(out);
+               pr_uuid(out, m->uuid.b);
+               prt_newline(out);
+
+               prt_printf(out, "Size:");
+               prt_tab(out);
+               prt_units_u64(out, device_size << 9);
+               prt_newline(out);
+
+               prt_printf(out, "Bucket size:");
+               prt_tab(out);
+               prt_units_u64(out, bucket_size << 9);
+               prt_newline(out);
+
+               prt_printf(out, "First bucket:");
+               prt_tab(out);
+               prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
+               prt_newline(out);
+
+               prt_printf(out, "Buckets:");
+               prt_tab(out);
+               prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
+               prt_newline(out);
+
+               prt_printf(out, "Last mount:");
+               prt_tab(out);
+               if (m->last_mount)
+                       pr_time(out, le64_to_cpu(m->last_mount));
+               else
+                       prt_printf(out, "(never)");
+               prt_newline(out);
+
+               prt_printf(out, "State:");
+               prt_tab(out);
+               prt_printf(out, "%s",
+                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
+                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
+                      : "unknown");
+               prt_newline(out);
+
+               prt_printf(out, "Label:");
+               prt_tab(out);
+               if (BCH_MEMBER_GROUP(m)) {
+                       unsigned idx = BCH_MEMBER_GROUP(m) - 1;
+
+                       if (idx < disk_groups_nr(gi))
+                               prt_printf(out, "%s (%u)",
+                                      gi->entries[idx].label, idx);
+                       else
+                               prt_printf(out, "(bad disk labels section)");
+               } else {
+                       prt_printf(out, "(none)");
+               }
+               prt_newline(out);
+
+               prt_printf(out, "Data allowed:");
+               prt_tab(out);
+               if (BCH_MEMBER_DATA_ALLOWED(m))
+                       prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
+               else
+                       prt_printf(out, "(none)");
+               prt_newline(out);
+
+               prt_printf(out, "Has data:");
+               prt_tab(out);
+               if (data_have)
+                       prt_bitflags(out, bch2_data_types, data_have);
+               else
+                       prt_printf(out, "(none)");
+               prt_newline(out);
+
+               prt_printf(out, "Discard:");
+               prt_tab(out);
+               prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
+               prt_newline(out);
+
+               prt_printf(out, "Freespace initialized:");
+               prt_tab(out);
+               prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+               prt_newline(out);
+
+               printbuf_indent_sub(out, 2);
+       }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members = {
+       .validate       = bch2_sb_members_validate,
+       .to_text        = bch2_sb_members_to_text,
+};
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
new file mode 100644 (file)
index 0000000..34e1cf6
--- /dev/null
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+       return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+       return bch2_dev_is_online(ca) &&
+               ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+       if (!percpu_ref_tryget(&ca->io_ref))
+               return false;
+
+       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
+               return true;
+
+       percpu_ref_put(&ca->io_ref);
+       return false;
+}
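+
+/*
+ * Hypothetical usage sketch: a successful bch2_dev_get_ioref() must be paired
+ * with percpu_ref_put() once the I/O is issued, e.g.
+ *
+ *     if (bch2_dev_get_ioref(ca, READ)) {
+ *             submit_read(ca);
+ *             percpu_ref_put(&ca->io_ref);
+ *     }
+ *
+ * where submit_read() stands in for the actual I/O.
+ */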
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+                                        unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs.nr; i++)
+               if (devs.devs[i] == dev)
+                       return true;
+
+       return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+                                         unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs->nr; i++)
+               if (devs->devs[i] == dev) {
+                       array_remove_item(devs->devs, devs->nr, i);
+                       return;
+               }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+                                        unsigned dev)
+{
+       if (!bch2_dev_list_has_dev(*devs, dev)) {
+               BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+               devs->devs[devs->nr++] = dev;
+       }
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+                                             const struct bch_devs_mask *mask)
+{
+       struct bch_dev *ca = NULL;
+
+       while ((*iter = mask
+               ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+               : *iter) < c->sb.nr_devices &&
+              !(ca = rcu_dereference_check(c->devs[*iter],
+                                           lockdep_is_held(&c->state_lock))))
+               (*iter)++;
+
+       return ca;
+}
+
+#define for_each_member_device_rcu(ca, c, iter, mask)                  \
+       for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       if ((ca = __bch2_next_dev(c, iter, NULL)))
+               percpu_ref_get(&ca->ref);
+       rcu_read_unlock();
+
+       return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter)                            \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_dev(c, &(iter)));                      \
+            percpu_ref_put(&ca->ref), (iter)++)
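+
+/*
+ * Hypothetical usage sketch: each iteration holds the ref taken in
+ * bch2_get_next_dev(), so breaking out early must drop it by hand:
+ *
+ *     for_each_member_device(ca, c, i)
+ *             if (ca->dev_idx == target_idx) {
+ *                     percpu_ref_put(&ca->ref);
+ *                     break;
+ *             }
+ *
+ * target_idx here is just a stand-in for whatever the caller is looking for.
+ */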
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+                                                     unsigned *iter,
+                                                     int state_mask)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+              (!((1 << ca->mi.state) & state_mask) ||
+               !percpu_ref_tryget(&ca->io_ref)))
+               (*iter)++;
+       rcu_read_unlock();
+
+       return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask)              \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_online_dev(c, &(iter), state_mask));   \
+            percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter)                            \
+       __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter)                                        \
+       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+
+#define for_each_readable_member(ca, c, iter)                          \
+       __for_each_online_member(ca, c, iter,                           \
+               (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_protected(c->devs[idx],
+                                        lockdep_is_held(&c->sb_lock) ||
+                                        lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+       struct bch_devs_mask devs;
+       struct bch_dev *ca;
+       unsigned i;
+
+       memset(&devs, 0, sizeof(devs));
+       for_each_online_member(ca, c, i)
+               __set_bit(ca->dev_idx, devs.d);
+       return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members;
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
index c9a5a7cb97cf842f3afacdc1846464bc444fdd7c..862702880d35e6e09b9b221a4975b01e6a49279e 100644 (file)
@@ -1,8 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
 #include "checksum.h"
 #include "counters.h"
 #include "disk_groups.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
-#include "journal_io.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "quota.h"
+#include "sb-clean.h"
+#include "sb-members.h"
 #include "super-io.h"
 #include "super.h"
 #include "trace.h"
@@ -1005,235 +1004,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
        mutex_unlock(&c->sb_lock);
 }
 
-/* BCH_SB_FIELD_members: */
-
-static int bch2_sb_members_validate(struct bch_sb *sb,
-                                   struct bch_sb_field *f,
-                                   struct printbuf *err)
-{
-       struct bch_sb_field_members *mi = field_to_type(f, members);
-       unsigned i;
-
-       if ((void *) (mi->members + sb->nr_devices) >
-           vstruct_end(&mi->field)) {
-               prt_printf(err, "too many devices for section size");
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
-                       prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
-                              i, le64_to_cpu(m->nbuckets), LONG_MAX);
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le64_to_cpu(m->nbuckets) -
-                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
-                       prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
-                              i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le16_to_cpu(m->bucket_size) <
-                   le16_to_cpu(sb->block_size)) {
-                       prt_printf(err, "device %u: bucket size %u smaller than block size %u",
-                              i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le16_to_cpu(m->bucket_size) <
-                   BCH_SB_BTREE_NODE_SIZE(sb)) {
-                       prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
-                              i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
-                       return -BCH_ERR_invalid_sb_members;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
-                                   struct bch_sb_field *f)
-{
-       struct bch_sb_field_members *mi = field_to_type(f, members);
-       struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
-       unsigned i;
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               unsigned data_have = bch2_sb_dev_has_data(sb, i);
-               u64 bucket_size = le16_to_cpu(m->bucket_size);
-               u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               prt_printf(out, "Device:");
-               prt_tab(out);
-               prt_printf(out, "%u", i);
-               prt_newline(out);
-
-               printbuf_indent_add(out, 2);
-
-               prt_printf(out, "UUID:");
-               prt_tab(out);
-               pr_uuid(out, m->uuid.b);
-               prt_newline(out);
-
-               prt_printf(out, "Size:");
-               prt_tab(out);
-               prt_units_u64(out, device_size << 9);
-               prt_newline(out);
-
-               prt_printf(out, "Bucket size:");
-               prt_tab(out);
-               prt_units_u64(out, bucket_size << 9);
-               prt_newline(out);
-
-               prt_printf(out, "First bucket:");
-               prt_tab(out);
-               prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
-               prt_newline(out);
-
-               prt_printf(out, "Buckets:");
-               prt_tab(out);
-               prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
-               prt_newline(out);
-
-               prt_printf(out, "Last mount:");
-               prt_tab(out);
-               if (m->last_mount)
-                       pr_time(out, le64_to_cpu(m->last_mount));
-               else
-                       prt_printf(out, "(never)");
-               prt_newline(out);
-
-               prt_printf(out, "State:");
-               prt_tab(out);
-               prt_printf(out, "%s",
-                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
-                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
-                      : "unknown");
-               prt_newline(out);
-
-               prt_printf(out, "Label:");
-               prt_tab(out);
-               if (BCH_MEMBER_GROUP(m)) {
-                       unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
-                       if (idx < disk_groups_nr(gi))
-                               prt_printf(out, "%s (%u)",
-                                      gi->entries[idx].label, idx);
-                       else
-                               prt_printf(out, "(bad disk labels section)");
-               } else {
-                       prt_printf(out, "(none)");
-               }
-               prt_newline(out);
-
-               prt_printf(out, "Data allowed:");
-               prt_tab(out);
-               if (BCH_MEMBER_DATA_ALLOWED(m))
-                       prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
-               else
-                       prt_printf(out, "(none)");
-               prt_newline(out);
-
-               prt_printf(out, "Has data:");
-               prt_tab(out);
-               if (data_have)
-                       prt_bitflags(out, bch2_data_types, data_have);
-               else
-                       prt_printf(out, "(none)");
-               prt_newline(out);
-
-               prt_printf(out, "Discard:");
-               prt_tab(out);
-               prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
-               prt_newline(out);
-
-               prt_printf(out, "Freespace initialized:");
-               prt_tab(out);
-               prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
-               prt_newline(out);
-
-               printbuf_indent_sub(out, 2);
-       }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_members = {
-       .validate       = bch2_sb_members_validate,
-       .to_text        = bch2_sb_members_to_text,
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
-                                 struct bch_sb_field *f,
-                                 struct printbuf *err)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&crypt->field), sizeof(*crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       if (BCH_CRYPT_KDF_TYPE(crypt)) {
-               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       prt_printf(out, "KFD:               %llu", BCH_CRYPT_KDF_TYPE(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
-       prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
-       .validate       = bch2_sb_crypt_validate,
-       .to_text        = bch2_sb_crypt_to_text,
-};
-
-/* BCH_SB_FIELD_clean: */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
-{
-       struct jset_entry *entry;
-       int ret;
-
-       for (entry = clean->start;
-            entry < (struct jset_entry *) vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               ret = bch2_journal_entry_validate(c, NULL, entry,
-                                                 le16_to_cpu(c->disk_sb.sb->version),
-                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
-                                                 write);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
 /* Downgrade if superblock is at a higher version than currently supported: */
 void bch2_sb_maybe_downgrade(struct bch_fs *c)
 {
@@ -1260,232 +1030,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
        c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
 }
 
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
-       int ret;
-
-       /*
-        * Unconditionally write superblock, to verify it hasn't changed before
-        * we go rw:
-        */
-
-       mutex_lock(&c->sb_lock);
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
-       bch2_sb_maybe_downgrade(c);
-       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
-       ret = bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-       struct jset_entry *entry = *end;
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-       memset(entry, 0, u64s * sizeof(u64));
-       /*
-        * The u64s field counts from the start of data, ignoring the shared
-        * fields.
-        */
-       entry->u64s = cpu_to_le16(u64s - 1);
-
-       *end = vstruct_next(*end);
-       return entry;
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                          struct jset_entry **end,
-                                          u64 journal_seq)
-{
-       struct bch_dev *ca;
-       unsigned i, dev;
-
-       percpu_down_read(&c->mark_lock);
-
-       if (!journal_seq) {
-               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-                       bch2_fs_usage_acc_to_base(c, i);
-       } else {
-               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
-       }
-
-       {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_inodes;
-               u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-       }
-
-       {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_key_version;
-               u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-       }
-
-       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_reserved;
-               u->entry.level  = i;
-               u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-       }
-
-       for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
-                       cpu_replicas_entry(&c->replicas, i);
-               struct jset_entry_data_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
-                                    struct jset_entry_data_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_data_usage;
-               u->v            = cpu_to_le64(c->usage_base->replicas[i]);
-               unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
-                             "embedded variable length struct");
-       }
-
-       for_each_member_device(ca, c, dev) {
-               unsigned b = sizeof(struct jset_entry_dev_usage) +
-                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
-               struct jset_entry_dev_usage *u =
-                       container_of(jset_entry_init(end, b),
-                                    struct jset_entry_dev_usage, entry);
-
-               u->entry.type = BCH_JSET_ENTRY_dev_usage;
-               u->dev = cpu_to_le32(dev);
-               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
-
-               for (i = 0; i < BCH_DATA_NR; i++) {
-                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
-                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
-                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
-               }
-       }
-
-       percpu_up_read(&c->mark_lock);
-
-       for (i = 0; i < 2; i++) {
-               struct jset_entry_clock *clock =
-                       container_of(jset_entry_init(end, sizeof(*clock)),
-                                    struct jset_entry_clock, entry);
-
-               clock->entry.type = BCH_JSET_ENTRY_clock;
-               clock->rw       = i;
-               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
-       }
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *sb_clean;
-       struct jset_entry *entry;
-       unsigned u64s;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       if (BCH_SB_CLEAN(c->disk_sb.sb))
-               goto out;
-
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
-       u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
-       sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
-       if (!sb_clean) {
-               bch_err(c, "error resizing superblock while setting filesystem clean");
-               goto out;
-       }
-
-       sb_clean->flags         = 0;
-       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
-
-       /* Trying to catch outstanding bug: */
-       BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
-       entry = sb_clean->start;
-       bch2_journal_super_entries_add_common(c, &entry, 0);
-       entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
-       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
-       memset(entry, 0,
-              vstruct_end(&sb_clean->field) - (void *) entry);
-
-       /*
-        * this should be in the write path, and we should be validating every
-        * superblock section:
-        */
-       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
-       if (ret) {
-               bch_err(c, "error writing marking filesystem clean: validate error");
-               goto out;
-       }
-
-       bch2_write_super(c);
-out:
-       mutex_unlock(&c->sb_lock);
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb,
-                                 struct bch_sb_field *f,
-                                 struct printbuf *err)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
-       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&clean->field), sizeof(*clean));
-               return -BCH_ERR_invalid_sb_clean;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-       struct jset_entry *entry;
-
-       prt_printf(out, "flags:          %x",   le32_to_cpu(clean->flags));
-       prt_newline(out);
-       prt_printf(out, "journal_seq:    %llu", le64_to_cpu(clean->journal_seq));
-       prt_newline(out);
-
-       for (entry = clean->start;
-            entry != vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
-                   !entry->u64s)
-                       continue;
-
-               bch2_journal_entry_to_text(out, NULL, entry);
-               prt_newline(out);
-       }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
-       .validate       = bch2_sb_clean_validate,
-       .to_text        = bch2_sb_clean_to_text,
-};
-
 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 #define x(f, nr)                                       \
        [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
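
The clean-section ops (bch2_sb_clean_validate / bch2_sb_clean_to_text) leave this file, but they stay registered through the x-macro table shown in the remaining context: every superblock field type supplies a bch_sb_field_ops_<name> symbol and the table indexes them by BCH_SB_FIELD_<name>, so the ops struct only has to exist somewhere at link time. A minimal, self-contained sketch of the same pattern (the field list and names below are illustrative, not the real ones):

#include <stdio.h>

/* Hypothetical field list standing in for the real x-macro list */
#define SB_FIELDS()      \
        x(journal, 0)    \
        x(members, 1)    \
        x(clean,   6)

struct sb_field_ops { const char *name; };

/* One ops struct per field, named by convention */
#define x(f, nr) static const struct sb_field_ops sb_field_ops_##f = { .name = #f };
SB_FIELDS()
#undef x

/* The enum and the dispatch table are generated from the same list */
enum sb_field_type {
#define x(f, nr) SB_FIELD_##f = nr,
        SB_FIELDS()
#undef x
};

static const struct sb_field_ops *sb_field_ops_table[] = {
#define x(f, nr) [SB_FIELD_##f] = &sb_field_ops_##f,
        SB_FIELDS()
#undef x
};

int main(void)
{
        printf("%s\n", sb_field_ops_table[SB_FIELD_clean]->name); /* prints "clean" */
        return 0;
}
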
index 904adea6a0da20e9699a920c00e909fca4703ac5..5181dcb3a4d08ea5f3e3a4c4061cec7db87cad03 100644 (file)
@@ -121,19 +121,9 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
        };
 }
 
-/* BCH_SB_FIELD_clean: */
-
-void bch2_journal_super_entries_add_common(struct bch_fs *,
-                                          struct jset_entry **, u64);
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-
 void bch2_sb_maybe_downgrade(struct bch_fs *);
 void bch2_sb_upgrade(struct bch_fs *, unsigned);
 
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
 void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
                           struct bch_sb_field *);
 void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
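
The BCH_SB_FIELD_clean prototypes dropped from this header presumably move to the new sb-clean.h that super.c starts including below. A guess at that header's shape, reconstructed only from the declarations removed in this hunk and the definitions removed from the previous file (the extern for bch_sb_field_ops_clean is an inference, since the x-macro dispatch table still has to see that symbol):

/* sb-clean.h, sketch only, not the actual header */
#ifndef _BCACHEFS_SB_CLEAN_H
#define _BCACHEFS_SB_CLEAN_H

struct bch_fs;
struct jset_entry;
struct bch_sb_field_clean;

void bch2_journal_super_entries_add_common(struct bch_fs *,
                                           struct jset_entry **, u64);
int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);

int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);

extern const struct bch_sb_field_ops bch_sb_field_ops_clean;

#endif /* _BCACHEFS_SB_CLEAN_H */
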
index eee56969c77934a7788b5e8083f1dc69d61cab6b..82e992b35dcd878832e89cbf7d4a850a4547906d 100644 (file)
@@ -13,6 +13,7 @@
 #include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -30,6 +31,8 @@
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
 #include "fsck.h"
 #include "inode.h"
 #include "io.h"
@@ -44,6 +47,7 @@
 #include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "sb-clean.h"
 #include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
@@ -469,6 +473,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_fs_counters_exit(c);
        bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
+       bch2_fs_fs_io_direct_exit(c);
+       bch2_fs_fs_io_buffered_exit(c);
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
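
__bch2_fs_free() now tears the direct-I/O and buffered-I/O state down just before the old fsio state, the reverse of the init order added in the next hunk. Judging only from how they are called here and below, the paired entry points in the new fs-io-buffered.h / fs-io-direct.h headers presumably look something like this (a sketch, not the actual headers):

/* fs-io-buffered.h (sketch) */
int  bch2_fs_fs_io_buffered_init(struct bch_fs *);
void bch2_fs_fs_io_buffered_exit(struct bch_fs *);

/* fs-io-direct.h (sketch) */
int  bch2_fs_fs_io_direct_init(struct bch_fs *);
void bch2_fs_fs_io_direct_exit(struct bch_fs *);

The int returns are inferred from their use in the error-chained init sequence; the exits are called as plain statements, hence void.
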
@@ -844,7 +850,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_encryption_init(c) ?:
            bch2_fs_compress_init(c) ?:
            bch2_fs_ec_init(c) ?:
-           bch2_fs_fsio_init(c);
+           bch2_fs_fsio_init(c) ?:
+           bch2_fs_fs_io_buffered_init(c) ?:
+           bch2_fs_fs_io_direct_init(c);
        if (ret)
                goto err;
 
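
The whole init sequence rides on GCC's ?: extension: "a ?: b" evaluates to a when a is nonzero, so ret ends up holding the first nonzero error code and the remaining init calls are skipped. Each call has to stay on the chain; a stray semicolon in the middle would silently discard the errors of everything after it. A standalone illustration with made-up step functions:

#include <stdio.h>

/* Stand-ins for the bch2_fs_*_init() calls */
static int step_a(void) { return 0; }
static int step_b(void) { return -5; }   /* fails */
static int step_c(void) { return 0; }    /* not evaluated once step_b fails */

int main(void)
{
        /*
         * GNU "?:" with the middle operand omitted: x ?: y yields x if x is
         * nonzero, otherwise y, and y is only evaluated in the latter case.
         */
        int ret = step_a() ?:
                  step_b() ?:
                  step_c();

        printf("ret = %d\n", ret);       /* prints ret = -5 */
        return 0;
}
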
@@ -2000,6 +2008,7 @@ err:
 BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
+__maybe_unused
 static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
 module_param_named(version, bch2_metadata_version, uint, 0400);
 
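
__maybe_unused (the kernel spelling of __attribute__((__unused__))) keeps the compiler from warning about bch2_metadata_version in configurations where the module_param_named() registration is the only reference and compiles away, as it presumably does in the userspace build of this tree. A tiny standalone version of the same annotation:

#include <stdio.h>

/* Spelled out here so the sketch builds outside the kernel */
#define __maybe_unused __attribute__((__unused__))

__maybe_unused
static unsigned demo_version = 42;     /* no "defined but not used" warning */

int main(void)
{
        puts("ok");
        return 0;
}
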
index 36bcb9ec2b3ad9c07a8a4f6e0f1154d30c38b0ad..bf762df18012b1a1b463724d665551506fc74384 100644 (file)
@@ -8,220 +8,6 @@
 
 #include <linux/math64.h>
 
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
-       return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
-       return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
-       u32 remainder;
-
-       div_u64_rem(s, ca->mi.bucket_size, &remainder);
-       return remainder;
-}
-
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
-                                                u32 *offset)
-{
-       return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
-       return !percpu_ref_is_zero(&ca->io_ref);
-}
-
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
-{
-       return bch2_dev_is_online(ca) &&
-               ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
-       if (!percpu_ref_tryget(&ca->io_ref))
-               return false;
-
-       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
-           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
-               return true;
-
-       percpu_ref_put(&ca->io_ref);
-       return false;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
-       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
-                                        unsigned dev)
-{
-       unsigned i;
-
-       for (i = 0; i < devs.nr; i++)
-               if (devs.devs[i] == dev)
-                       return true;
-
-       return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
-                                         unsigned dev)
-{
-       unsigned i;
-
-       for (i = 0; i < devs->nr; i++)
-               if (devs->devs[i] == dev) {
-                       array_remove_item(devs->devs, devs->nr, i);
-                       return;
-               }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
-                                        unsigned dev)
-{
-       if (!bch2_dev_list_has_dev(*devs, dev)) {
-               BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
-               devs->devs[devs->nr++] = dev;
-       }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
-       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
-                                             const struct bch_devs_mask *mask)
-{
-       struct bch_dev *ca = NULL;
-
-       while ((*iter = mask
-               ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
-               : *iter) < c->sb.nr_devices &&
-              !(ca = rcu_dereference_check(c->devs[*iter],
-                                           lockdep_is_held(&c->state_lock))))
-               (*iter)++;
-
-       return ca;
-}
-
-#define for_each_member_device_rcu(ca, c, iter, mask)                  \
-       for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
-{
-       struct bch_dev *ca;
-
-       rcu_read_lock();
-       if ((ca = __bch2_next_dev(c, iter, NULL)))
-               percpu_ref_get(&ca->ref);
-       rcu_read_unlock();
-
-       return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define for_each_member_device(ca, c, iter)                            \
-       for ((iter) = 0;                                                \
-            (ca = bch2_get_next_dev(c, &(iter)));                      \
-            percpu_ref_put(&ca->ref), (iter)++)
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
-                                                     unsigned *iter,
-                                                     int state_mask)
-{
-       struct bch_dev *ca;
-
-       rcu_read_lock();
-       while ((ca = __bch2_next_dev(c, iter, NULL)) &&
-              (!((1 << ca->mi.state) & state_mask) ||
-               !percpu_ref_tryget(&ca->io_ref)))
-               (*iter)++;
-       rcu_read_unlock();
-
-       return ca;
-}
-
-#define __for_each_online_member(ca, c, iter, state_mask)              \
-       for ((iter) = 0;                                                \
-            (ca = bch2_get_next_online_dev(c, &(iter), state_mask));   \
-            percpu_ref_put(&ca->io_ref), (iter)++)
-
-#define for_each_online_member(ca, c, iter)                            \
-       __for_each_online_member(ca, c, iter, ~0)
-
-#define for_each_rw_member(ca, c, iter)                                        \
-       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
-
-#define for_each_readable_member(ca, c, iter)                          \
-       __for_each_online_member(ca, c, iter,                           \
-               (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
-
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
-{
-       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
-       return rcu_dereference_check(c->devs[idx], 1);
-}
-
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
-{
-       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
-       return rcu_dereference_protected(c->devs[idx],
-                                        lockdep_is_held(&c->sb_lock) ||
-                                        lockdep_is_held(&c->state_lock));
-}
-
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
-       struct bch_devs_mask devs;
-       struct bch_dev *ca;
-       unsigned i;
-
-       memset(&devs, 0, sizeof(devs));
-       for_each_online_member(ca, c, i)
-               __set_bit(ca->dev_idx, devs.d);
-       return devs;
-}
-
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
-       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
-       u64 b_offset    = bucket_to_sector(ca, b);
-       u64 b_end       = bucket_to_sector(ca, b + 1);
-       unsigned i;
-
-       if (!b)
-               return true;
-
-       for (i = 0; i < layout->nr_superblocks; i++) {
-               u64 offset = le64_to_cpu(layout->sb_offset[i]);
-               u64 end = offset + (1 << layout->sb_max_size_bits);
-
-               if (!(offset >= b_end || end <= b_offset))
-                       return true;
-       }
-
-       return false;
-}
-
 struct bch_fs *bch2_dev_to_fs(dev_t);
 struct bch_fs *bch2_uuid_to_fs(__uuid_t);
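
All of the member-device helpers and iteration macros removed above leave this header; given the sb-clean split visible earlier in the commit, their new home is presumably a matching sb-members header. The contract worth keeping in mind is the comment on for_each_member_device(): the loop increment is what drops the percpu ref, so breaking (or returning) early hands that ref to the caller. A usage sketch against the removed macros (first_rw_dev() is an illustrative helper, not code from the tree):

/* Sketch: find the first writable member device, transferring its ref. */
static struct bch_dev *first_rw_dev(struct bch_fs *c)
{
        struct bch_dev *ca;
        unsigned i;

        for_each_member_device(ca, c, i) {
                if (ca->mi.state == BCH_MEMBER_STATE_rw) {
                        /*
                         * Early exit: the macro's increment would normally do
                         * percpu_ref_put(&ca->ref), so the caller now owns the
                         * ref and must percpu_ref_put() it when done.
                         */
                        return ca;
                }
        }

        return NULL;
}
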