git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 14ce2a2031 bcachefs: fixes for building in userspace
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 21 Dec 2017 23:00:30 +0000 (18:00 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Thu, 21 Dec 2017 23:06:45 +0000 (18:06 -0500)
61 files changed:
.bcachefs_revision
cmd_debug.c
cmd_fsck.c
cmd_migrate.c
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/bug.h
include/linux/time64.h
libbcachefs/acl.c
libbcachefs/alloc.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/buckets.c
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/checksum.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/migrate.h
libbcachefs/move.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/sysfs.c
libbcachefs/tier.c
libbcachefs/util.h
libbcachefs/vstructs.h
libbcachefs/xattr.c
linux/bio.c
linux/blkdev.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 77247162b86c12ba1dd5ec9ce676ad98ccb4c05b..038384588f1120e59a586e64b9380c1c3b4bf94c 100644
@@ -1 +1 @@
-e57b5958cf4e8530d26f7c36a6e1427fb284cc70
+14ce2a2031f3761a4b957aa2e5aac446ce18b87c
diff --git a/cmd_debug.c b/cmd_debug.c
index b1bdda8c3e9174cc0b3acea82a69aaf4aaf1b96b..1a2c1dbd195c83b699d1cbcb6b81435ae2b83beb 100644
@@ -293,11 +293,11 @@ int cmd_list(int argc, char *argv[])
                                                list_modes, "list mode");
                        break;
                case 'f':
-                       opts.fix_errors = FSCK_ERR_YES;
-                       opts.norecovery = false;
+                       opt_set(opts, fix_errors, FSCK_OPT_YES);
+                       opt_set(opts, norecovery, false);
                        break;
                case 'v':
-                       opts.verbose_recovery = true;
+                       opt_set(opts, verbose_recovery, true);
                        break;
                case 'h':
                        list_keys_usage();
diff --git a/cmd_fsck.c b/cmd_fsck.c
index 9b01524a01c583beb6bfa0a86037e577ffbcb574..556a4e1bfc73c56e428d854604e16ae780167904 100644
@@ -28,18 +28,19 @@ int cmd_fsck(int argc, char *argv[])
        int opt;
 
        opt_set(opts, degraded, true);
+       opt_set(opts, fix_errors, FSCK_OPT_ASK);
 
        while ((opt = getopt(argc, argv, "pynfvh")) != -1)
                switch (opt) {
                case 'p':
-                       opt_set(opts, fix_errors, FSCK_ERR_YES);
+                       opt_set(opts, fix_errors, FSCK_OPT_YES);
                        break;
                case 'y':
-                       opt_set(opts, fix_errors, FSCK_ERR_YES);
+                       opt_set(opts, fix_errors, FSCK_OPT_YES);
                        break;
                case 'n':
                        opt_set(opts, nochanges, true);
-                       opt_set(opts, fix_errors, FSCK_ERR_NO);
+                       opt_set(opts, fix_errors, FSCK_OPT_NO);
                        break;
                case 'f':
                        /* force check, even if filesystem marked clean: */
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 58c0bb96ae02647e25987aee5bd4728d91ca867a..f46a09dd25d4b5d37a8b50b20a5f637e76a39f6b 100644
@@ -164,7 +164,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
        struct bch_inode_unpacked new_inode;
        int ret;
 
-       bch2_inode_init(c, &new_inode, uid, gid, mode, rdev);
+       bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
 
        ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
                                &c->unused_inode_hint);
@@ -247,7 +247,6 @@ static void write_data(struct bch_fs *c,
                       struct bch_inode_unpacked *dst_inode,
                       u64 dst_offset, void *buf, size_t len)
 {
-       struct disk_reservation res;
        struct bch_write_op op;
        struct bio_vec bv;
        struct closure cl;
@@ -261,12 +260,15 @@ static void write_data(struct bch_fs *c,
        op.wbio.bio.bi_iter.bi_size = len;
        bch2_bio_map(&op.wbio.bio, buf);
 
-       int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
+       bch2_write_op_init(&op, c);
+
+       op.write_point  = writepoint_hashed(0);
+       op.pos          = POS(dst_inode->bi_inum, dst_offset >> 9);
+
+       int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
        if (ret)
                die("error reserving space in new filesystem: %s", strerror(-ret));
 
-       bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
-                          POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
        closure_call(&op.cl, bch2_write, NULL, &cl);
        closure_sync(&cl);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 10cad5ccf74643bebe63fb39ebe3aab310167ade..7293eef00f830933c6cfd61c1092ab7dfa0a6ee6 100644
@@ -243,7 +243,8 @@ static inline void bioset_free(struct bio_set *bs)
 
 static inline int bioset_init(struct bio_set *bs,
                              unsigned pool_size,
-                             unsigned front_pad)
+                             unsigned front_pad,
+                             int flags)
 {
        bs->front_pad = front_pad;
        return 0;
@@ -251,6 +252,10 @@ static inline int bioset_init(struct bio_set *bs,
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+enum {
+       BIOSET_NEED_BVECS       = 1 << 0,
+       BIOSET_NEED_RESCUER     = 1 << 1,
+};
 
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
@@ -271,13 +276,6 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask)
 }
 
 extern void bio_endio(struct bio *);
-extern void bio_endio_nodec(struct bio *);
-
-static inline void bio_io_error(struct bio *bio)
-{
-       bio->bi_error = -EIO;
-       bio_endio(bio);
-}
 
 extern void bio_advance(struct bio *, unsigned);
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 551799446476f19b90b724342f642142d041966d..42cd003227d430ad4102f50ce1f40c8fb880a7d6 100644
@@ -13,7 +13,27 @@ struct bio_set;
 struct bio;
 struct block_device;
 typedef void (bio_end_io_t) (struct bio *);
-typedef void (bio_destructor_t) (struct bio *);
+
+/*
+ * Block error status values.  See block/blk-core:blk_errors for the details.
+ */
+typedef u8 __bitwise blk_status_t;
+#define        BLK_STS_OK 0
+#define BLK_STS_NOTSUPP                ((__force blk_status_t)1)
+#define BLK_STS_TIMEOUT                ((__force blk_status_t)2)
+#define BLK_STS_NOSPC          ((__force blk_status_t)3)
+#define BLK_STS_TRANSPORT      ((__force blk_status_t)4)
+#define BLK_STS_TARGET         ((__force blk_status_t)5)
+#define BLK_STS_NEXUS          ((__force blk_status_t)6)
+#define BLK_STS_MEDIUM         ((__force blk_status_t)7)
+#define BLK_STS_PROTECTION     ((__force blk_status_t)8)
+#define BLK_STS_RESOURCE       ((__force blk_status_t)9)
+#define BLK_STS_IOERR          ((__force blk_status_t)10)
+
+/* hack for device mapper, don't use elsewhere: */
+#define BLK_STS_DM_REQUEUE    ((__force blk_status_t)11)
+
+#define BLK_STS_AGAIN          ((__force blk_status_t)12)
 
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
@@ -22,7 +42,7 @@ typedef void (bio_destructor_t) (struct bio *);
 struct bio {
        struct bio              *bi_next;       /* request queue link */
        struct block_device     *bi_bdev;
-       int                     bi_error;
+       blk_status_t            bi_status;
        unsigned int            bi_opf;         /* bottom bits req flags,
                                                 * top bits REQ_OP. Use
                                                 * accessors.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f196c70489407c6c52f42144b9ed0490fc1e1a8e..1d5581dc1b918e735bc56e7659775e6e51ca6e42 100644
@@ -197,5 +197,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
 
 #define capable(cap)           true
 
+int blk_status_to_errno(blk_status_t status);
+blk_status_t errno_to_blk_status(int errno);
+
 #endif /* __TOOLS_LINUX_BLKDEV_H */
 
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 89cdd30dd11d9f8bba2015b49cf6d7a878f334dc..e25568c848bd443569d8e279bffba474eb763812 100644
@@ -14,7 +14,7 @@
 #define BUG()                  do { assert(0); unreachable(); } while (0)
 #define BUG_ON(cond)           assert(!(cond))
 
-#define WARN_ON_ONCE(cond)     assert(!(cond))
+#define WARN_ON_ONCE(cond)     ({ bool _r = (cond); if (_r) assert(0); _r; })
 #define WARN_ONCE(cond, msg)   ({ bool _r = (cond); if (_r) assert(0); _r; })
 
 #define __WARN()               assert(0)
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 2d9f8291f581bcadcde4d35294757c17e94b5b67..870bdef458ef61434538332b2d7f0f76d292b6d5 100644
@@ -204,4 +204,19 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);
 
+static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+       /* Avoid division in the common cases 1 ns and 1 s. */
+       if (gran == 1) {
+               /* nothing */
+       } else if (gran == NSEC_PER_SEC) {
+               t.tv_nsec = 0;
+       } else if (gran > 1 && gran < NSEC_PER_SEC) {
+               t.tv_nsec -= t.tv_nsec % gran;
+       } else {
+               WARN(1, "illegal file time granularity: %u", gran);
+       }
+       return t;
+}
+
 #endif /* _LINUX_TIME64_H */
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index 2632d21c9b3f43406d273b6800250820b1c46541..480941d64e2a1b0f5dbfb8a4da1ade1fb935b048 100644
@@ -193,8 +193,7 @@ int bch2_set_acl(struct inode *vinode, struct posix_acl *acl, int type)
                        if (ret < 0)
                                return ret;
                        else {
-                               inode->v.i_ctime =
-                                       current_fs_time(inode->v.i_sb);
+                               inode->v.i_ctime = current_time(&inode->v);
                                mark_inode_dirty(&inode->v);
                                if (ret == 0)
                                        acl = NULL;
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index d29d871a1c0a49988b2b97469087df214e7fe8ef..29799df65dbccf24d5bff97df4b4ae5e99f91fee 100644
@@ -257,7 +257,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
                return;
 
        a = bkey_s_c_to_alloc(k);
-       ca = c->devs[a.k->p.inode];
+       ca = bch_dev_bkey_exists(c, a.k->p.inode);
 
        if (a.k->p.offset >= ca->mi.nbuckets)
                return;
@@ -305,10 +305,12 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
                                bch2_alloc_read_key(c, bkey_i_to_s_c(k));
        }
 
+       mutex_lock(&c->bucket_lock);
        for_each_member_device(ca, c, i) {
                bch2_recalc_min_prio(c, ca, READ);
                bch2_recalc_min_prio(c, ca, WRITE);
        }
+       mutex_unlock(&c->bucket_lock);
 
        return 0;
 }
@@ -368,7 +370,7 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
        if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
                return 0;
 
-       ca = c->devs[pos.inode];
+       ca = bch_dev_bkey_exists(c, pos.inode);
 
        if (pos.offset >= ca->mi.nbuckets)
                return 0;
@@ -461,7 +463,7 @@ static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
 
 /* Bucket heap / gen */
 
-void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
 {
        struct prio_clock *clock = &c->prio_clock[rw];
        struct bucket *g;
@@ -975,7 +977,7 @@ static int bch2_allocator_thread(void *arg)
 
 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
-       struct bch_dev *ca = c->devs[ob->ptr.dev];
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
        spin_lock(&ob->lock);
        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
@@ -1303,7 +1305,7 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
 
        for (i = wp->nr_ptrs - 1; i >= 0; --i) {
                struct open_bucket *ob = wp->ptrs[i];
-               struct bch_dev *ca = c->devs[ob->ptr.dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
                if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
                        BUG_ON(ca->open_buckets_partial_nr >=
@@ -1331,7 +1333,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
        unsigned i;
 
        writepoint_for_each_ptr(wp, ob, i) {
-               struct bch_dev *ca = c->devs[ob->ptr.dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
                BUG_ON(ptr_stale(ca, &ob->ptr));
        }
@@ -1537,7 +1539,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
 
        for (i = 0; i < wp->nr_ptrs_can_use; i++) {
                struct open_bucket *ob = wp->ptrs[i];
-               struct bch_dev *ca = c->devs[ob->ptr.dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
                struct bch_extent_ptr tmp = ob->ptr;
 
                EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
@@ -1589,7 +1591,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
                ra_pages += bdi->ra_pages;
        }
 
-       c->bdi.ra_pages = ra_pages;
+       bch2_set_ra_pages(c, ra_pages);
 
        /* Find fastest, slowest tiers with devices: */
 
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index b679dd16895df9c8a285974313c2c039aa1c8b2c..e25baf56f37fe34b78c15a28438dc67723de5a8a 100644
@@ -326,9 +326,9 @@ struct io_count {
 struct bch_dev {
        struct kobject          kobj;
        struct percpu_ref       ref;
+       struct completion       ref_completion;
        struct percpu_ref       io_ref;
-       struct completion       stop_complete;
-       struct completion       offline_complete;
+       struct completion       io_ref_completion;
 
        struct bch_fs           *fs;
 
@@ -515,12 +515,11 @@ struct bch_fs {
        struct closure          sb_write;
        struct mutex            sb_lock;
 
-       struct backing_dev_info bdi;
-
        /* BTREE CACHE */
        struct bio_set          btree_read_bio;
 
        struct btree_root       btree_roots[BTREE_ID_NR];
+       bool                    btree_roots_dirty;
        struct mutex            btree_root_lock;
 
        struct btree_cache      btree_cache;
@@ -710,6 +709,14 @@ struct bch_fs {
 #undef BCH_TIME_STAT
 };
 
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+       if (c->vfs_sb)
+               c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
 static inline bool bch2_fs_running(struct bch_fs *c)
 {
        return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 2dc9a7e0719c7f27a0eeae5265a745ebb31cbbdf..6e0e0452bbb5529df2a9921489fbf76d2230d41b 100644
@@ -593,18 +593,24 @@ struct bch_inode_generation {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(inode_generation,        BCH_INODE_GENERATION);
 
-#define BCH_INODE_FIELDS()                             \
-       BCH_INODE_FIELD(bi_atime,       64)             \
-       BCH_INODE_FIELD(bi_ctime,       64)             \
-       BCH_INODE_FIELD(bi_mtime,       64)             \
-       BCH_INODE_FIELD(bi_otime,       64)             \
-       BCH_INODE_FIELD(bi_size,        64)             \
-       BCH_INODE_FIELD(bi_sectors,     64)             \
-       BCH_INODE_FIELD(bi_uid,         32)             \
-       BCH_INODE_FIELD(bi_gid,         32)             \
-       BCH_INODE_FIELD(bi_nlink,       32)             \
-       BCH_INODE_FIELD(bi_generation,  32)             \
-       BCH_INODE_FIELD(bi_dev,         32)
+#define BCH_INODE_FIELDS()                                     \
+       BCH_INODE_FIELD(bi_atime,                       64)     \
+       BCH_INODE_FIELD(bi_ctime,                       64)     \
+       BCH_INODE_FIELD(bi_mtime,                       64)     \
+       BCH_INODE_FIELD(bi_otime,                       64)     \
+       BCH_INODE_FIELD(bi_size,                        64)     \
+       BCH_INODE_FIELD(bi_sectors,                     64)     \
+       BCH_INODE_FIELD(bi_uid,                         32)     \
+       BCH_INODE_FIELD(bi_gid,                         32)     \
+       BCH_INODE_FIELD(bi_nlink,                       32)     \
+       BCH_INODE_FIELD(bi_generation,                  32)     \
+       BCH_INODE_FIELD(bi_dev,                         32)     \
+       BCH_INODE_FIELD(bi_data_checksum,               8)      \
+       BCH_INODE_FIELD(bi_compression,                 8)
+
+#define BCH_INODE_FIELDS_INHERIT()                             \
+       BCH_INODE_FIELD(bi_data_checksum)                       \
+       BCH_INODE_FIELD(bi_compression)
 
 enum {
        /*
@@ -794,7 +800,7 @@ struct bch_sb_layout {
        __u8                    sb_max_size_bits; /* base 2 of 512 byte sectors */
        __u8                    nr_superblocks;
        __u8                    pad[5];
-       __u64                   sb_offset[61];
+       __le64                  sb_offset[61];
 } __attribute__((packed, aligned(8)));
 
 #define BCH_SB_LAYOUT_SECTOR   7
@@ -1089,6 +1095,11 @@ struct jset_entry {
        };
 };
 
+struct jset_entry_blacklist {
+       struct jset_entry       entry;
+       __le64                  seq;
+};
+
 #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
 
 enum {
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index 73089a90f4860d64804f000725604f8bbb7c9602..970150848cee4ed643aa76d1279cf0bd6b6c7f72 100644
@@ -1,6 +1,7 @@
 
 #include "bcachefs.h"
 #include "bkey.h"
+#include "bkey_methods.h"
 #include "bset.h"
 #include "util.h"
 
@@ -80,37 +81,6 @@ static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
                                        const struct bkey_format *format) {}
 #endif
 
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
-       char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
-       p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
-         k->u64s, k->type, k->p.inode, k->p.offset,
-         k->p.snapshot, k->size, k->version.lo);
-
-       BUG_ON(bkey_packed(k));
-
-       switch (k->type) {
-       case KEY_TYPE_DELETED:
-               p(" deleted");
-               break;
-       case KEY_TYPE_DISCARD:
-               p(" discard");
-               break;
-       case KEY_TYPE_ERROR:
-               p(" error");
-               break;
-       case KEY_TYPE_COOKIE:
-               p(" cookie");
-               break;
-       }
-#undef p
-
-       return out - buf;
-}
-
 struct pack_state {
        const struct bkey_format *format;
        unsigned                bits;   /* bits remaining in current word */
@@ -336,7 +306,8 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
         * Extents - we have to guarantee that if an extent is packed, a trimmed
         * version will also pack:
         */
-       if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+       if (bkey_start_offset(in) <
+           le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
                return false;
 
        pack_state_finish(&state, out);
@@ -800,7 +771,7 @@ static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
                              bool *eax_zeroed)
 {
        unsigned bits = format->bits_per_field[field];
-       u64 offset = format->field_offset[field];
+       u64 offset = le64_to_cpu(format->field_offset[field]);
        unsigned i, byte, bit_offset, align, shl, shr;
 
        if (!bits && !offset) {
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index dc0b88f75ebe4a4934988ef465afeef131a106d4..896979565c5b1a95174295b1e1d2d3560972d565 100644
@@ -8,7 +8,6 @@
 #include "vstructs.h"
 
 void bch2_to_binary(char *, const u64 *, unsigned);
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
 
 #define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
 
@@ -377,7 +376,8 @@ static inline u64 bkey_field_max(const struct bkey_format *f,
                                 enum bch_bkey_fields nr)
 {
        return f->bits_per_field[nr] < 64
-               ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+               ? (le64_to_cpu(f->field_offset[nr]) +
+                  ~(~0ULL << f->bits_per_field[nr]))
                : U64_MAX;
 }
 
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 23894158d62087106fdf394d7e71605c0bab726d..1736a483b7cfd2102cd5a371bc5a0939768d58da 100644
@@ -18,28 +18,11 @@ const struct bkey_ops *bch2_bkey_ops[] = {
        [BKEY_TYPE_BTREE]       = &bch2_bkey_btree_ops,
 };
 
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
-                        struct bkey_s_c k)
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+                                 struct bkey_s_c k)
 {
        const struct bkey_ops *ops = bch2_bkey_ops[type];
 
-       if (k.k->u64s < BKEY_U64s)
-               return "u64s too small";
-
-       if (!ops->is_extents) {
-               if (k.k->size)
-                       return "nonzero size field";
-       } else {
-               if ((k.k->size == 0) != bkey_deleted(k.k))
-                       return "bad size field";
-       }
-
-       if (ops->is_extents &&
-           !k.k->size &&
-           !bkey_deleted(k.k))
-               return "zero size field";
-
        switch (k.k->type) {
        case KEY_TYPE_DELETED:
        case KEY_TYPE_DISCARD:
@@ -63,8 +46,41 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
        }
 }
 
-const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
-                                   struct bkey_s_c k)
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+                             struct bkey_s_c k)
+{
+       const struct bkey_ops *ops = bch2_bkey_ops[type];
+
+       if (k.k->u64s < BKEY_U64s)
+               return "u64s too small";
+
+       if (!ops->is_extents) {
+               if (k.k->size)
+                       return "nonzero size field";
+       } else {
+               if ((k.k->size == 0) != bkey_deleted(k.k))
+                       return "bad size field";
+       }
+
+       if (ops->is_extents &&
+           !k.k->size &&
+           !bkey_deleted(k.k))
+               return "zero size field";
+
+       if (k.k->p.snapshot)
+               return "nonzero snapshot";
+
+       return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+                             struct bkey_s_c k)
+{
+       return __bch2_bkey_invalid(c, type, k) ?:
+               bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
        if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
                return "key before start of btree node";
@@ -72,10 +88,7 @@ const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
        if (bkey_cmp(k.k->p, b->data->max_key) > 0)
                return "key past end of btree node";
 
-       if (k.k->p.snapshot)
-               return "nonzero snapshot";
-
-       return bch2_bkey_invalid(c, btree_node_type(b), k);
+       return NULL;
 }
 
 void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
@@ -86,7 +99,8 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 
        BUG_ON(!k.k->u64s);
 
-       invalid = bch2_btree_bkey_invalid(c, b, k);
+       invalid = bch2_bkey_invalid(c, type, k) ?:
+               bch2_bkey_in_btree_node(b, k);
        if (invalid) {
                char buf[160];
 
@@ -100,33 +114,62 @@ void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
                ops->key_debugcheck(c, b, k);
 }
 
-char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
-                      char *buf, size_t size, struct bkey_s_c k)
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 {
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       char *out = buf, *end = buf + size;
 
-       if (k.k->type >= KEY_TYPE_GENERIC_NR &&
-           ops->val_to_text)
-               ops->val_to_text(c, buf, size, k);
+       p("u64s %u type %u ", k->u64s, k->type);
+
+       if (bkey_cmp(k->p, POS_MAX))
+               p("%llu:%llu", k->p.inode, k->p.offset);
+       else
+               p("POS_MAX");
 
-       return buf;
+       p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+       return out - buf;
 }
 
-char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
-                           char *buf, size_t size, struct bkey_s_c k)
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+                    char *buf, size_t size, struct bkey_s_c k)
 {
        const struct bkey_ops *ops = bch2_bkey_ops[type];
        char *out = buf, *end = buf + size;
 
-       out += bch2_bkey_to_text(out, end - out, k.k);
-
-       if (k.k->type >= KEY_TYPE_GENERIC_NR &&
-           ops->val_to_text) {
-               out += scnprintf(out, end - out, ": ");
-               ops->val_to_text(c, out, end - out, k);
+       switch (k.k->type) {
+       case KEY_TYPE_DELETED:
+               p(" deleted");
+               break;
+       case KEY_TYPE_DISCARD:
+               p(" discard");
+               break;
+       case KEY_TYPE_ERROR:
+               p(" error");
+               break;
+       case KEY_TYPE_COOKIE:
+               p(" cookie");
+               break;
+       default:
+               if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+                       ops->val_to_text(c, buf, size, k);
+               break;
        }
 
-       return buf;
+       return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+                         char *buf, size_t size, struct bkey_s_c k)
+{
+       char *out = buf, *end = buf + size;
+
+       out += bch2_bkey_to_text(out, end - out, k.k);
+       out += scnprintf(out, end - out, ": ");
+       out += bch2_val_to_text(c, type, out, end - out, k);
+
+       return out - buf;
 }
 
 void bch2_bkey_swab(enum bkey_type type,
diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h
index 29c1abd383cb1544c7336e80b870d6151de558c8..59db3037e6dde1fdafdc3991d2b3a53921ad08d4 100644
@@ -64,15 +64,19 @@ struct bkey_ops {
        bool            is_extents;
 };
 
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+                                 struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
 const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
-                                   struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
 void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
-                      char *, size_t, struct bkey_s_c);
-char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
-                           char *, size_t, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+                    char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+                         char *, size_t, struct bkey_s_c);
 
 void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
                    struct bkey_packed *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 1198fe39c10038b380170f9054cc12feb013c228..2294cc3adeca80902aede25b74ed8eb1ccdb0353 100644
@@ -96,7 +96,7 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
 
                extent_for_each_ptr(e, ptr) {
-                       struct bch_dev *ca = c->devs[ptr->dev];
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                        size_t b = PTR_BUCKET_NR(ca, ptr);
 
                        if (gen_after(ca->oldest_gens[b], ptr->gen))
@@ -159,14 +159,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                    (!c->opts.nofsck &&
                     fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
-                                "superblock not marked as containing replicas"))) {
+                                "superblock not marked as containing replicas (type %u)",
+                                data_type))) {
                        ret = bch2_check_mark_super(c, e, data_type);
                        if (ret)
                                return ret;
                }
 
                extent_for_each_ptr(e, ptr) {
-                       struct bch_dev *ca = c->devs[ptr->dev];
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                        struct bucket *g = PTR_BUCKET(ca, ptr);
 
                        if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
@@ -315,14 +316,14 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
        lockdep_assert_held(&c->sb_lock);
 
        for (i = 0; i < layout->nr_superblocks; i++) {
-               if (layout->sb_offset[i] == BCH_SB_SECTOR)
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset == BCH_SB_SECTOR)
                        mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
                                              BUCKET_SB, flags);
 
-               mark_metadata_sectors(c, ca,
-                                     layout->sb_offset[i],
-                                     layout->sb_offset[i] +
-                                     (1 << layout->sb_max_size_bits),
+               mark_metadata_sectors(c, ca, offset,
+                                     offset + (1 << layout->sb_max_size_bits),
                                      BUCKET_SB, flags);
        }
 
@@ -414,7 +415,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                spin_lock(&ob->lock);
                if (ob->valid) {
                        gc_pos_set(c, gc_pos_alloc(c, ob));
-                       ca = c->devs[ob->ptr.dev];
+                       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
                        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
                                               gc_pos_alloc(c, ob),
                                               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
@@ -424,7 +425,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
        }
 }
 
-void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
        struct bucket *g;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 38c373c69d20c63a1a48367212c7eb63918c42dd..87a8ddf9215eec33ab92c49827959fc8f033b2cc 100644
@@ -556,7 +556,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        struct bset_tree *t;
        struct bset *start_bset = bset(b, &b->set[start_idx]);
        bool used_mempool = false;
-       u64 start_time;
+       u64 start_time, seq = 0;
        unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
        bool sorting_entire_node = start_idx == 0 &&
                end_idx == b->nsets;
@@ -595,12 +595,9 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
                bch2_time_stats_update(&c->btree_sort_time, start_time);
 
        /* Make sure we preserve bset journal_seq: */
-       for (t = b->set + start_idx + 1;
-            t < b->set + end_idx;
-            t++)
-               start_bset->journal_seq =
-                       max(start_bset->journal_seq,
-                           bset(b, t)->journal_seq);
+       for (t = b->set + start_idx; t < b->set + end_idx; t++)
+               seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+       start_bset->journal_seq = cpu_to_le64(seq);
 
        if (sorting_entire_node) {
                unsigned u64s = le16_to_cpu(out->keys.u64s);
@@ -958,6 +955,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 {
        struct bkey_packed *k, *prev = NULL;
        struct bpos prev_pos = POS_MIN;
+       enum bkey_type type = btree_node_type(b);
        bool seen_non_whiteout = false;
        const char *err;
        int ret = 0;
@@ -1025,7 +1023,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
        if (!BSET_SEPARATE_WHITEOUTS(i)) {
                seen_non_whiteout = true;
-               whiteout_u64s = 0;
+               *whiteout_u64s = 0;
        }
 
        for (k = i->start;
@@ -1059,16 +1057,17 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                }
 
                if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
-                       bch2_bkey_swab(btree_node_type(b), &b->format, k);
+                       bch2_bkey_swab(type, &b->format, k);
 
                u = bkey_disassemble(b, k, &tmp);
 
-               invalid = bch2_btree_bkey_invalid(c, b, u);
+               invalid = __bch2_bkey_invalid(c, type, u) ?:
+                       bch2_bkey_in_btree_node(b, u) ?:
+                       (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
                if (invalid) {
                        char buf[160];
 
-                       bch2_bkey_val_to_text(c, btree_node_type(b),
-                                             buf, sizeof(buf), u);
+                       bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
                        btree_err(BTREE_ERR_FIXABLE, c, b, i,
                                  "invalid bkey %s: %s", buf, invalid);
 
@@ -1114,6 +1113,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
        struct btree_node_entry *bne;
        struct btree_node_iter *iter;
        struct btree_node *sorted;
+       struct bkey_packed *k;
+       struct bset *i;
        bool used_mempool;
        unsigned u64s;
        int ret, retry_read = 0, write = READ;
@@ -1137,7 +1138,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                unsigned sectors, whiteout_u64s = 0;
                struct nonce nonce;
                struct bch_csum csum;
-               struct bset *i;
 
                if (!b->written) {
                        i = &b->data->keys;
@@ -1238,6 +1238,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
        btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
 
+       i = &b->data->keys;
+       for (k = i->start; k != vstruct_last(i);) {
+               enum bkey_type type = btree_node_type(b);
+               struct bkey tmp;
+               struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+               const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+               if (invalid) {
+                       char buf[160];
+
+                       bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+                       btree_err(BTREE_ERR_FIXABLE, c, b, i,
+                                 "invalid bkey %s: %s", buf, invalid);
+
+                       btree_keys_account_key_drop(&b->nr, 0, k);
+
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+                       memmove_u64s_down(k, bkey_next(k),
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       continue;
+               }
+
+               k = bkey_next(k);
+       }
+
        bch2_bset_build_aux_tree(b, b->set, false);
 
        set_needs_whiteout(btree_bset_first(b));
@@ -1278,13 +1303,13 @@ static void btree_node_read_work(struct work_struct *work)
                bio->bi_iter.bi_size    = btree_bytes(c);
                submit_bio_wait(bio);
 start:
-               bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+               bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
                percpu_ref_put(&rb->pick.ca->io_ref);
 
                __set_bit(rb->pick.ca->dev_idx, avoid.d);
                rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
 
-               if (!bio->bi_error &&
+               if (!bio->bi_status &&
                    !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
                        goto out;
        } while (!IS_ERR_OR_NULL(rb->pick.ca));
@@ -1377,17 +1402,24 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
        BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
 
        bch2_btree_node_read(c, b, true);
-       six_unlock_write(&b->lock);
 
        if (btree_node_read_error(b)) {
-               six_unlock_intent(&b->lock);
-               return -EIO;
+               bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+               mutex_lock(&c->btree_cache.lock);
+               list_move(&b->list, &c->btree_cache.freeable);
+               mutex_unlock(&c->btree_cache.lock);
+
+               ret = -EIO;
+               goto err;
        }
 
        bch2_btree_set_root_for_read(c, b);
+err:
+       six_unlock_write(&b->lock);
        six_unlock_intent(&b->lock);
 
-       return 0;
+       return ret;
 }
 
 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1412,35 +1444,57 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
        struct closure *cl      = wbio->cl;
        __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
        struct bkey_i_extent *new_key;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct btree_iter iter;
+       int ret;
 
-       six_lock_read(&b->lock);
-       bkey_copy(&tmp.k, &b->key);
-       six_unlock_read(&b->lock);
+       __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+                              BTREE_MAX_DEPTH,
+                              b->level, 0);
+retry:
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto err;
 
-       if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
-               /* Node has been freed: */
+       /* has node been freed? */
+       if (iter.nodes[b->level] != b) {
+               /* node has been freed: */
+               if (!btree_node_dying(b))
+                       panic("foo4\n");
                goto out;
        }
 
-       new_key = bkey_i_to_extent(&tmp.k);
+       if (!btree_node_hashed(b))
+               panic("foo5\n");
 
-       while (wbio->replicas_failed) {
-               unsigned idx = __fls(wbio->replicas_failed);
+       bkey_copy(&tmp.k, &b->key);
 
-               bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
-               wbio->replicas_failed ^= 1 << idx;
-       }
+       new_key = bkey_i_to_extent(&tmp.k);
+       e = extent_i_to_s(new_key);
+       extent_for_each_ptr_backwards(e, ptr)
+               if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+                       bch2_extent_drop_ptr(e, ptr);
 
-       if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
-           bch2_btree_node_update_key(c, b, new_key)) {
-               set_btree_node_noevict(b);
-               bch2_fatal_error(c);
-       }
+       if (!bch2_extent_nr_ptrs(e.c))
+               goto err;
+
+       ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+       if (ret == -EINTR)
+               goto retry;
+       if (ret)
+               goto err;
 out:
+       bch2_btree_iter_unlock(&iter);
        bio_put(&wbio->bio);
        btree_node_write_done(c, b);
        if (cl)
                closure_put(cl);
+       return;
+err:
+       set_btree_node_noevict(b);
+       bch2_fs_fatal_error(c, "fatal error writing btree node");
+       goto out;
 }
 
 void bch2_btree_write_error_work(struct work_struct *work)
@@ -1470,12 +1524,17 @@ static void btree_node_write_endio(struct bio *bio)
        struct closure *cl              = !wbio->split ? wbio->cl : NULL;
        struct bch_fs *c                = wbio->c;
        struct bch_dev *ca              = wbio->ca;
+       unsigned long flags;
 
        bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
 
-       if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
-           bch2_meta_write_fault("btree"))
-               set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+       if (bio->bi_status == BLK_STS_REMOVED ||
+           bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+           bch2_meta_write_fault("btree")) {
+               spin_lock_irqsave(&c->btree_write_error_lock, flags);
+               bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+       }
 
        if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
@@ -1491,12 +1550,11 @@ static void btree_node_write_endio(struct bio *bio)
                wbio->used_mempool,
                wbio->data);
 
-       if (wbio->replicas_failed) {
-               unsigned long flags;
-
+       if (wbio->failed.nr) {
                spin_lock_irqsave(&c->btree_write_error_lock, flags);
                bio_list_add(&c->btree_write_error_list, &wbio->bio);
                spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
                queue_work(c->wq, &c->btree_write_error_work);
                return;
        }
@@ -1707,6 +1765,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
        wbio->cl                = parent;
+       wbio->failed.nr         = 0;
        wbio->order             = order;
        wbio->used_mempool      = used_mempool;
        wbio->data              = data;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index b0e64957d4930f4b6339c98cdac83d2944fe6c87..0b505a738e86ed3f2fc7696ad3e27e14d3b2a982 100644
@@ -75,8 +75,8 @@ bool bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
 {
        struct btree_iter *linked;
        struct btree *b = iter->nodes[level];
-       enum btree_node_locked_type want = btree_lock_want(iter, level);
-       enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+       int want = btree_lock_want(iter, level);
+       int have = btree_node_locked_type(iter, level);
 
        if (want == have)
                return true;
@@ -108,6 +108,17 @@ success:
        return true;
 }
 
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+       unsigned l;
+
+       for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+               if (!bch2_btree_node_relock(iter, l))
+                       return false;
+
+       return true;
+}
+
 /* Slowpath: */
 bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                           unsigned level,
@@ -214,7 +225,6 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
                                     unsigned new_locks_want)
 {
        struct btree_iter *linked;
-       unsigned l;
 
        /* Drop locks we don't want anymore: */
        if (new_locks_want < iter->locks_want)
@@ -228,12 +238,9 @@ bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
        iter->locks_want = new_locks_want;
        btree_iter_drop_extra_locks(iter);
 
-       for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
-               if (!bch2_btree_node_relock(iter, l))
-                       goto fail;
+       if (bch2_btree_iter_relock(iter))
+               return true;
 
-       return true;
-fail:
        /*
         * Just an optimization: ancestor nodes must be locked before child
         * nodes, so set locks_want on iterators that might lock ancestors
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index c2711892d7f3aace97bc6bc01e1a296ddfca1f3e..acfe5b59df56bd9242a0e2a217fd74d67800c368 100644
@@ -75,7 +75,7 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
        mark_btree_node_locked(iter, level, SIX_LOCK_intent);
 }
 
-static inline int btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
 {
        return level < iter->locks_want
                ? SIX_LOCK_intent
@@ -111,6 +111,7 @@ static inline bool btree_node_lock(struct btree *b, struct bpos pos,
 }
 
 bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
 
 void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
 void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f1e06a378c9a82fe2cf554951db8193693121bc1..f0e6896a8a5e4bb2216d58ef8d5ea0e82c4a92ab 100644
@@ -196,6 +196,7 @@ enum btree_flags {
        BTREE_NODE_accessed,
        BTREE_NODE_write_in_flight,
        BTREE_NODE_just_written,
+       BTREE_NODE_dying,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -207,6 +208,7 @@ BTREE_FLAG(write_idx);
 BTREE_FLAG(accessed);
 BTREE_FLAG(write_in_flight);
 BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index e11fcec963ba4e923831dbafc8dff81ce068af78..c7c2930650d3f91da560187f2cd8941a522f7670 100644
@@ -130,7 +130,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
                            __le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
-                              struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+                              struct btree *, struct bkey_i_extent *);
 
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 1fe8fff8eb07f9b313cc26df8cbd8b6a8b127182..04854532b8b48627c1731a602259cb39fd1942b9 100644
@@ -21,7 +21,7 @@
 static void btree_node_will_make_reachable(struct btree_update *,
                                           struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -686,7 +686,7 @@ retry:
                BUG_ON(c->btree_roots[b->btree_id].as != as);
                c->btree_roots[b->btree_id].as = NULL;
 
-               bch2_btree_set_root_ondisk(c, b);
+               bch2_btree_set_root_ondisk(c, b, WRITE);
 
                /*
                 * We don't have to wait anything anything here (before
@@ -914,6 +914,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        struct btree_write *w;
        struct bset_tree *t;
 
+       set_btree_node_dying(b);
        btree_interior_update_add_node_reference(as, b);
 
        /*
@@ -925,7 +926,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         * in with keys that aren't in the journal anymore:
         */
        for_each_bset(b, t)
-               as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+               as->journal_seq = max(as->journal_seq,
+                                     le64_to_cpu(bset(b, t)->journal_seq));
 
        mutex_lock(&c->btree_interior_update_lock);
 
@@ -1027,6 +1029,10 @@ static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        mutex_unlock(&c->btree_cache.lock);
 
        mutex_lock(&c->btree_root_lock);
+       BUG_ON(btree_node_root(c, b) &&
+              (b->level < btree_node_root(c, b)->level ||
+               !btree_node_dying(btree_node_root(c, b))));
+
        btree_node_root(c, b) = b;
        mutex_unlock(&c->btree_root_lock);
 
@@ -1054,7 +1060,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
                            gc_pos_btree_root(b->btree_id));
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
 {
        struct btree_root *r = &c->btree_roots[b->btree_id];
 
@@ -1064,6 +1070,8 @@ static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
        bkey_copy(&r->key, &b->key);
        r->level = b->level;
        r->alive = true;
+       if (rw == WRITE)
+               c->btree_roots_dirty = true;
 
        mutex_unlock(&c->btree_root_lock);
 }
@@ -1787,64 +1795,16 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        return ret;
 }
 
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
-                              struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+                                        struct btree_update *as,
+                                        struct btree_iter *iter,
+                                        struct btree *b, struct btree *new_hash,
+                                        struct bkey_i_extent *new_key)
 {
-       struct btree_update *as = NULL;
-       struct btree *parent, *new_hash = NULL;
-       struct btree_iter iter;
-       struct closure cl;
+       struct btree *parent;
        bool must_rewrite_parent = false;
        int ret;
 
-       __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
-                              BTREE_MAX_DEPTH,
-                              b->level, 0);
-       closure_init_stack(&cl);
-
-       ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
-       if (ret)
-               return ret;
-
-retry:
-       down_read(&c->gc_lock);
-       ret = bch2_btree_iter_traverse(&iter);
-       if (ret)
-               goto err;
-
-       /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
-       if (!new_hash &&
-           PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
-               /* bch2_btree_reserve_get will unlock */
-               do {
-                       ret = bch2_btree_cache_cannibalize_lock(c, &cl);
-                       closure_sync(&cl);
-               } while (ret == -EAGAIN);
-
-               BUG_ON(ret);
-
-               new_hash = bch2_btree_node_mem_alloc(c);
-       }
-
-       as = bch2_btree_update_start(c, iter.btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    BTREE_INSERT_NOFAIL|
-                                    BTREE_INSERT_USE_RESERVE|
-                                    BTREE_INSERT_USE_ALLOC_RESERVE,
-                                    &cl);
-       if (IS_ERR(as)) {
-               ret = PTR_ERR(as);
-               if (ret == -EAGAIN || ret == -EINTR) {
-                       bch2_btree_iter_unlock(&iter);
-                       up_read(&c->gc_lock);
-                       closure_sync(&cl);
-                       goto retry;
-               }
-               goto err;
-       }
-
-       mutex_lock(&c->btree_interior_update_lock);
-
        /*
         * Two corner cases that need to be thought about here:
         *
@@ -1869,22 +1829,12 @@ retry:
        if (b->will_make_reachable)
                must_rewrite_parent = true;
 
-       /* other case: btree node being freed */
-       if (iter.nodes[b->level] != b) {
-               /* node has been freed: */
-               BUG_ON(btree_node_hashed(b));
-               mutex_unlock(&c->btree_interior_update_lock);
-               goto err;
-       }
-
-       mutex_unlock(&c->btree_interior_update_lock);
-
        if (must_rewrite_parent)
                as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
 
        btree_interior_update_add_node_reference(as, b);
 
-       parent = iter.nodes[b->level + 1];
+       parent = iter->nodes[b->level + 1];
        if (parent) {
                if (new_hash) {
                        bkey_copy(&new_hash->key, &new_key->k_i);
@@ -1893,8 +1843,8 @@ retry:
                        BUG_ON(ret);
                }
 
-               bch2_btree_insert_node(as, parent, &iter,
-                                      &keylist_single(&new_key->k_i));
+               bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
 
                if (new_hash) {
                        mutex_lock(&c->btree_cache.lock);
@@ -1914,7 +1864,7 @@ retry:
 
                BUG_ON(btree_node_root(c, b) != b);
 
-               bch2_btree_node_lock_write(b, &iter);
+               bch2_btree_node_lock_write(b, iter);
 
                bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
                              c->opts.btree_node_size, true,
@@ -1925,14 +1875,94 @@ retry:
                                           &stats);
                bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
                                    gc_pos_btree_root(b->btree_id));
-               bkey_copy(&b->key, &new_key->k_i);
+
+               if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+                       bkey_copy(&b->key, &new_key->k_i);
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+               } else {
+                       bkey_copy(&b->key, &new_key->k_i);
+               }
 
                btree_update_updated_root(as);
-               bch2_btree_node_unlock_write(b, &iter);
+               bch2_btree_node_unlock_write(b, iter);
        }
 
        bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+                              struct btree *b, struct bkey_i_extent *new_key)
+{
+       struct btree_update *as = NULL;
+       struct btree *new_hash = NULL;
+       struct closure cl;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       if (!down_read_trylock(&c->gc_lock)) {
+               bch2_btree_iter_unlock(iter);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+       }
+
+       /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+       if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+               /* bch2_btree_reserve_get will unlock */
+               ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+               if (ret) {
+                       ret = -EINTR;
+
+                       bch2_btree_iter_unlock(iter);
+                       up_read(&c->gc_lock);
+                       closure_sync(&cl);
+                       down_read(&c->gc_lock);
+
+                       if (!bch2_btree_iter_relock(iter))
+                               goto err;
+               }
+
+               new_hash = bch2_btree_node_mem_alloc(c);
+       }
+
+       as = bch2_btree_update_start(c, iter->btree_id,
+                                    btree_update_reserve_required(c, b),
+                                    BTREE_INSERT_NOFAIL|
+                                    BTREE_INSERT_USE_RESERVE|
+                                    BTREE_INSERT_USE_ALLOC_RESERVE,
+                                    &cl);
+       if (IS_ERR(as)) {
+               ret = PTR_ERR(as);
+               if (ret == -EAGAIN)
+                       ret = -EINTR;
+
+               if (ret != -EINTR)
+                       goto err;
+
+               bch2_btree_iter_unlock(iter);
+               up_read(&c->gc_lock);
+               closure_sync(&cl);
+               down_read(&c->gc_lock);
+
+               if (!bch2_btree_iter_relock(iter))
+                       goto err;
+       }
+
+       ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+       if (ret)
+               goto err_free_update;
+
+       __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
                list_move(&new_hash->list, &c->btree_cache.freeable);
@@ -1941,14 +1971,12 @@ out:
                six_unlock_write(&new_hash->lock);
                six_unlock_intent(&new_hash->lock);
        }
-       bch2_btree_iter_unlock(&iter);
        up_read(&c->gc_lock);
        closure_sync(&cl);
        return ret;
-err:
-       if (as)
-               bch2_btree_update_free(as);
-       goto out;
+err_free_update:
+       bch2_btree_update_free(as);
+       goto err;
 }
 
 /* Init code: */
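
The function above follows a pattern this commit uses repeatedly: never block (on gc_lock, a closure, or an allocation) while holding btree node locks; instead drop the iterator's locks, block, then try to relock and return -EINTR so the caller retries from the top. A minimal sketch of that shape, using only the iterator calls visible in this diff (the wrapper name is illustrative, not a real bcachefs API):

	static int lock_gc_or_restart(struct bch_fs *c, struct btree_iter *iter)
	{
		if (down_read_trylock(&c->gc_lock))
			return 0;

		/* Can't block with node locks held: drop them first */
		bch2_btree_iter_unlock(iter);
		down_read(&c->gc_lock);

		/* The tree may have changed while we slept: */
		if (!bch2_btree_iter_relock(iter))
			return -EINTR;	/* caller unwinds and retries */

		return 0;
	}
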
@@ -1962,7 +1990,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
        BUG_ON(btree_node_root(c, b));
 
        __bch2_btree_set_root_inmem(c, b);
-       bch2_btree_set_root_ondisk(c, b);
+       bch2_btree_set_root_ondisk(c, b, READ);
 }
 
 int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
@@ -1998,7 +2026,7 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
        BUG_ON(btree_node_root(c, b));
 
        bch2_btree_set_root_inmem(as, b);
-       bch2_btree_set_root_ondisk(c, b);
+       bch2_btree_set_root_ondisk(c, b, WRITE);
 
        bch2_btree_open_bucket_put(c, b);
        six_unlock_intent(&b->lock);
index b73002def9fcb6037142d01733fcae9855143b51..f0a63232093aff48ccb18f1996d84685ed9681d4 100644 (file)
@@ -174,9 +174,11 @@ do {                                                                       \
 
 #define bch2_usage_read_raw(_stats)                                    \
 ({                                                                     \
-       typeof(*this_cpu_ptr(_stats)) _acc = { 0 };                     \
+       typeof(*this_cpu_ptr(_stats)) _acc;                             \
        int cpu;                                                        \
                                                                        \
+       memset(&_acc, 0, sizeof(_acc));                                 \
+                                                                       \
        for_each_possible_cpu(cpu)                                      \
                bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu));      \
                                                                        \
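
Replacing the { 0 } initializer with an explicit memset() keeps the accumulator zeroing working when the macro's typeof()-derived aggregate type trips up the userspace build this commit targets; behaviour is unchanged. A standalone sketch of the same read-and-sum shape, with a plain array standing in for real per-cpu data (all names here are illustrative):

	#include <string.h>

	#define NR_CPUS 2	/* stand-in for per-cpu iteration */

	struct usage { unsigned long sectors; unsigned long buckets; };

	static struct usage usage_read_raw(const struct usage per_cpu[NR_CPUS])
	{
		struct usage acc;
		int cpu;

		memset(&acc, 0, sizeof(acc));	/* instead of acc = { 0 } */

		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			acc.sectors += per_cpu[cpu].sectors;
			acc.buckets += per_cpu[cpu].buckets;
		}
		return acc;
	}
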
@@ -479,7 +481,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
 {
        struct bucket_mark old, new;
        unsigned saturated;
-       struct bch_dev *ca = c->devs[ptr->dev];
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
        unsigned data_type = type == S_META
                ? BUCKET_BTREE : BUCKET_DATA;
index 0bd8d2d8f5bc799108258c80bd828cb364282719..6f9b12265df3f2f92416289c64492fb218130730 100644 (file)
@@ -68,16 +68,14 @@ struct bch_dev_usage {
 
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
-
        /* _uncompressed_ sectors: */
+       u64                     online_reserved;
+       u64                     available_cache;
 
        struct {
                u64             data[S_ALLOC_NR];
                u64             persistent_reserved;
        }                       s[BCH_REPLICAS_MAX];
-
-       u64                     online_reserved;
-       u64                     available_cache;
 };
 
 /*
index d9a3212c7e096f3dd78a9a3677a8140dc4f5a643..24af2ca1620e1992185fb91c5f96bb4318e07b83 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "bcachefs_ioctl.h"
+#include "chardev.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -25,7 +26,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
                        return ERR_PTR(-EINVAL);
 
                rcu_read_lock();
-               ca = c->devs[dev];
+               ca = rcu_dereference(c->devs[dev]);
                if (ca)
                        percpu_ref_get(&ca->ref);
                rcu_read_unlock();
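
The lookup now uses rcu_dereference() rather than a plain array load, and pins the device with percpu_ref_get() before leaving the RCU read-side critical section, so the returned pointer stays valid after rcu_read_unlock(). The general shape, mirroring bch2_device_lookup() above (the helper name is illustrative):

	static struct bch_dev *dev_lookup(struct bch_fs *c, unsigned dev)
	{
		struct bch_dev *ca;

		rcu_read_lock();
		ca = rcu_dereference(c->devs[dev]);
		if (ca)
			percpu_ref_get(&ca->ref);	/* pin before unlock */
		rcu_read_unlock();

		return ca;	/* caller drops with percpu_ref_put() */
	}
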
@@ -80,7 +81,7 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
 
        devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
 
-       if (copy_from_user(user_devs, arg.devs,
+       if (copy_from_user(user_devs, user_arg->devs,
                           sizeof(u64) * arg.nr_devs))
                goto err;
 
index 1a0894179c47c91fac7021152eb73eff8bcec4cc..b0c8a50e7c135000df5f0ce76a77feb63a95d65a 100644 (file)
@@ -72,14 +72,15 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
        }
 }
 
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+                                                        unsigned opt)
 {
        if (c->sb.encryption_type)
                return c->opts.wide_macs
                        ? BCH_CSUM_CHACHA20_POLY1305_128
                        : BCH_CSUM_CHACHA20_POLY1305_80;
 
-       return bch2_csum_opt_to_type(c->opts.data_checksum, true);
+       return bch2_csum_opt_to_type(opt, true);
 }
 
 static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
@@ -143,6 +144,14 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
        return nonce;
 }
 
+static inline struct nonce null_nonce(void)
+{
+       struct nonce ret;
+
+       memset(&ret, 0, sizeof(ret));
+       return ret;
+}
+
 static inline struct nonce extent_nonce(struct bversion version,
                                        struct bch_extent_crc_unpacked crc)
 {
index 8357c8de895d28f692a9b81d160334275853d1ad..ca2a06e2e4eb3dbd751aa24ecb4c1622cc2bb594 100644 (file)
@@ -95,11 +95,17 @@ print:
        vscnprintf(buf, sizeof(_buf), fmt, args);
        va_end(args);
 
+       if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+               bch_err(c, "%s, exiting", buf);
+               mutex_unlock(&c->fsck_error_lock);
+               return FSCK_ERR_EXIT;
+       }
+
        if (flags & FSCK_CAN_FIX) {
-               if (c->opts.fix_errors == FSCK_ERR_ASK) {
+               if (c->opts.fix_errors == FSCK_OPT_ASK) {
                        printk(KERN_ERR "%s: fix?", buf);
                        fix = ask_yn();
-               } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+               } else if (c->opts.fix_errors == FSCK_OPT_YES ||
                           (c->opts.nochanges &&
                            !(flags & FSCK_CAN_IGNORE))) {
                        if (print)
index 68635eee3d85b1e75bc3175143172daaf3d748fb..28fe4fceb93a83ba918df360e09894a270eaef03 100644 (file)
@@ -96,9 +96,10 @@ enum {
 };
 
 enum fsck_err_opts {
-       FSCK_ERR_NO,
-       FSCK_ERR_YES,
-       FSCK_ERR_ASK,
+       FSCK_OPT_EXIT,
+       FSCK_OPT_YES,
+       FSCK_OPT_NO,
+       FSCK_OPT_ASK,
 };
 
 enum fsck_err_ret {
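
Note that FSCK_OPT_EXIT is first, so a zero-initialized fix_errors now means "exit on the first error" rather than "no". A sketch of dispatching on the renamed enum, following the fsck_err() hunk above (ask_yn() and FSCK_CAN_FIX are from the real code; the helper itself is illustrative, with FSCK_OPT_EXIT assumed to be handled earlier as in the real function):

	static bool should_fix(enum fsck_err_opts opt, unsigned flags)
	{
		if (!(flags & FSCK_CAN_FIX))
			return false;

		switch (opt) {
		case FSCK_OPT_ASK:
			return ask_yn();	/* interactive */
		case FSCK_OPT_YES:
			return true;		/* fix silently */
		default:
			return false;		/* FSCK_OPT_NO, FSCK_OPT_EXIT */
		}
	}
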
@@ -217,7 +218,7 @@ do {                                                                        \
 #define bcache_io_error(c, bio, fmt, ...)                              \
 do {                                                                   \
        __bcache_io_error(c, fmt, ##__VA_ARGS__);                       \
-       (bio)->bi_error = -EIO;                                         \
+       (bio)->bi_status = BLK_STS_IOERR;                               \
 } while (0)
 
 #endif /* _BCACHEFS_ERROR_H */
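
bi_error (a plain errno) became bi_status (a blk_status_t) in Linux 4.13; BLK_STS_IOERR replaces -EIO, and blk_status_to_errno() converts back at the boundaries, as the endio handlers later in this diff do. A sketch of a completion handler under the new convention (the handler itself is illustrative):

	static void my_endio(struct bio *bio)
	{
		int *errp = bio->bi_private;

		/* blk_status_t is not an errno; convert before storing */
		*errp = blk_status_to_errno(bio->bi_status);

		bio_put(bio);
	}
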
index 6e79f491a7465d499a32efc48f25d3672b5fc252..176978ca22319b653e7bee8928f3c1efa7b88ad7 100644 (file)
@@ -18,6 +18,7 @@
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
+#include "super.h"
 #include "super-io.h"
 #include "util.h"
 #include "xattr.h"
@@ -156,6 +157,19 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
        return nr_ptrs;
 }
 
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned nr_ptrs = 0;
+
+       extent_for_each_ptr(e, ptr)
+               nr_ptrs += (!ptr->cached &&
+                           bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+                           BCH_MEMBER_STATE_FAILED);
+
+       return nr_ptrs;
+}
+
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
        struct bkey_s_c_extent e;
@@ -362,7 +376,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
                            struct bkey_s_c_extent e,
                            const struct bch_extent_ptr *ptr)
 {
-       return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+       return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
 }
 
 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -411,8 +425,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
                                entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
                                break;
                        case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
-                               entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+                               entry->crc128.csum.hi = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.hi);
+                               entry->crc128.csum.lo = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.lo);
                                break;
                        case BCH_EXTENT_ENTRY_ptr:
                                break;
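
Besides the sparse-friendly __force casts, this hunk fixes a copy-paste bug: the crc128 case was byteswapping the crc64 fields. The corrected operation in isolation (struct layout here is illustrative):

	struct csum128 { __le64 lo, hi; };

	static void csum128_swab(struct csum128 *csum)
	{
		/* swab64() takes and returns plain u64; cast around __le64 */
		csum->hi = (__force __le64) swab64((__force u64) csum->hi);
		csum->lo = (__force __le64) swab64((__force u64) csum->lo);
	}
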
@@ -432,10 +448,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
        const struct bch_extent_ptr *ptr2;
        struct bch_dev *ca;
 
-       if (ptr->dev >= c->sb.nr_devices)
+       if (ptr->dev >= c->sb.nr_devices ||
+           !c->devs[ptr->dev])
                return "pointer to invalid device";
 
-       ca = c->devs[ptr->dev];
+       ca = bch_dev_bkey_exists(c, ptr->dev);
        if (!ca)
                return "pointer to invalid device";
 
@@ -487,7 +504,9 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
                        break;
                case BCH_EXTENT_ENTRY_ptr:
                        ptr = entry_to_ptr(entry);
-                       ca = c->devs[ptr->dev];
+                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                               ? bch_dev_bkey_exists(c, ptr->dev)
+                               : NULL;
 
                        p("ptr: %u:%llu gen %u%s", ptr->dev,
                          (u64) ptr->offset, ptr->gen,
@@ -528,7 +547,7 @@ static void extent_pick_read_device(struct bch_fs *c,
        struct bch_extent_crc_unpacked crc;
 
        extent_for_each_ptr_crc(e, ptr, crc) {
-               struct bch_dev *ca = c->devs[ptr->dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (ptr->cached && ptr_stale(ca, ptr))
                        continue;
@@ -621,7 +640,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
        bool bad;
 
        extent_for_each_ptr(e, ptr) {
-               ca = c->devs[ptr->dev];
+               ca = bch_dev_bkey_exists(c, ptr->dev);
                g = PTR_BUCKET(ca, ptr);
                replicas++;
 
@@ -1730,7 +1749,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
        memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
 
        extent_for_each_ptr(e, ptr) {
-               ca = c->devs[ptr->dev];
+               ca = bch_dev_bkey_exists(c, ptr->dev);
                g = PTR_BUCKET(ca, ptr);
                replicas++;
                ptrs_per_tier[ca->mi.tier]++;
@@ -1844,7 +1863,7 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
 static unsigned PTR_TIER(struct bch_fs *c,
                         const struct bch_extent_ptr *ptr)
 {
-       return c->devs[ptr->dev]->mi.tier;
+       return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
 }
 
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -1971,14 +1990,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                                      struct bkey_s_extent e)
 {
        struct bch_extent_ptr *ptr;
-       unsigned tier = 0, nr_cached = 0, nr_good = 0;
+       unsigned tier = 0, nr_cached = 0;
+       unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
        bool have_higher_tier;
 
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached &&
-                   c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
-                       nr_good++;
-
        if (nr_good <= c->opts.data_replicas)
                return;
 
@@ -2103,7 +2118,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
                                return BCH_MERGE_NOMERGE;
 
                        /* We don't allow extents to straddle buckets: */
-                       ca = c->devs[lp->dev];
+                       ca = bch_dev_bkey_exists(c, lp->dev);
 
                        if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
                                return BCH_MERGE_NOMERGE;
@@ -2347,6 +2362,30 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
        }
 }
 
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+       struct btree_iter iter;
+       struct bpos end = pos;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       end.offset += size;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+                            BTREE_ITER_WITH_HOLES, k) {
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
+
+               if (!bch2_extent_is_fully_allocated(k)) {
+                       ret = -ENOSPC;
+                       break;
+               }
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
+
 const struct bkey_ops bch2_bkey_extent_ops = {
        .key_invalid    = bch2_extent_invalid,
        .key_debugcheck = bch2_extent_debugcheck,
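
bch2_check_range_allocated() moves here from fs-io.c (the static copy is deleted below), so the O_DIRECT write path can keep using it: when a disk reservation can't be had, the write may still proceed if the whole target range is already fully allocated. Usage, as in bch2_direct_IO_write() later in this diff:

	ret = bch2_disk_reservation_get(c, &dio->iop.op.res,
					iter->count >> 9, 0);
	if (unlikely(ret)) {
		/* No space for a reservation: only overwrites allowed */
		if (bch2_check_range_allocated(c,
					POS(inode->v.i_ino, offset >> 9),
					iter->count >> 9))
			goto err;

		dio->iop.unalloc = true;
	}
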
index 1ec2db5e11ba6b0e1b16262a9a8b85445694610a..ab7993abbddfe838b0999b43fdd84deb7f8cddbf 100644 (file)
@@ -45,6 +45,7 @@ bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
 bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -243,14 +244,14 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
        case BCH_EXTENT_CRC32:
                return (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc32),
-                       .csum.lo                = crc->crc32.csum,
+                       .csum.lo                = (__force __le64) crc->crc32.csum,
                };
        case BCH_EXTENT_CRC64:
                return (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc64),
                        .nonce                  = crc->crc64.nonce,
-                       .csum.lo                = crc->crc64.csum_lo,
-                       .csum.hi                = crc->crc64.csum_hi,
+                       .csum.lo                = (__force __le64) crc->crc64.csum_lo,
+                       .csum.hi                = (__force __le64) crc->crc64.csum_hi,
                };
        case BCH_EXTENT_CRC128:
                return (struct bch_extent_crc_unpacked) {
@@ -425,4 +426,6 @@ bool bch2_cut_front(struct bpos, struct bkey_i *);
 bool bch2_cut_back(struct bpos, struct bkey *);
 void bch2_key_resize(struct bkey *, unsigned);
 
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
 #endif /* _BCACHEFS_EXTENTS_H */
index 298e3592ed6c8c707577094701a5fe95d12ff28d..2c34a85cb7be75057c75debfcc88a7a1fce7037d 100644 (file)
 
 struct i_sectors_hook {
        struct extent_insert_hook       hook;
-       s64                             sectors;
        struct bch_inode_info           *inode;
+       s64                             sectors;
+       u64                             new_i_size;
+       unsigned                        flags;
+       unsigned                        appending:1;
 };
 
 struct bchfs_write_op {
@@ -43,17 +46,6 @@ struct bchfs_write_op {
        struct bch_write_op             op;
 };
 
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
-                                       struct bch_inode_info *inode,
-                                       bool is_dio)
-{
-       op->inode               = inode;
-       op->sectors_added       = 0;
-       op->is_dio              = is_dio;
-       op->unalloc             = false;
-       op->new_i_size          = U64_MAX;
-}
-
 struct bch_writepage_io {
        struct closure                  cl;
 
@@ -65,12 +57,8 @@ struct dio_write {
        struct closure                  cl;
        struct kiocb                    *req;
        struct bch_fs                   *c;
-       long                            written;
-       long                            error;
        loff_t                          offset;
 
-       struct disk_reservation         res;
-
        struct iovec                    *iovec;
        struct iovec                    inline_vecs[UIO_FASTIOV];
        struct iov_iter                 iter;
@@ -129,12 +117,6 @@ static int inode_set_size(struct bch_inode_info *inode,
        lockdep_assert_held(&inode->ei_update_lock);
 
        bi->bi_size = *new_i_size;
-
-       if (atomic_long_read(&inode->ei_size_dirty_count))
-               bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
-       else
-               bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
        return 0;
 }
 
@@ -145,16 +127,16 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
        return __bch2_write_inode(c, inode, inode_set_size, &new_size);
 }
 
-static inline void i_size_dirty_put(struct bch_inode_info *inode)
+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
 {
-       atomic_long_dec_bug(&inode->ei_size_dirty_count);
+       inode->v.i_blocks += sectors;
 }
 
-static inline void i_size_dirty_get(struct bch_inode_info *inode)
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
 {
-       lockdep_assert_held(&inode->v.i_rwsem);
-
-       atomic_long_inc(&inode->ei_size_dirty_count);
+       mutex_lock(&inode->ei_update_lock);
+       __i_sectors_acct(c, inode, sectors);
+       mutex_unlock(&inode->ei_update_lock);
 }
 
 /* i_sectors accounting: */
@@ -172,90 +154,83 @@ i_sectors_hook_fn(struct extent_insert_hook *hook,
        int sign = bkey_extent_is_allocation(&insert->k) -
                (k.k && bkey_extent_is_allocation(k.k));
 
-       EBUG_ON(!(h->inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY));
-       EBUG_ON(!atomic_long_read(&h->inode->ei_sectors_dirty_count));
+       EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
 
        h->sectors += sectors * sign;
 
        return BTREE_INSERT_OK;
 }
 
-static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
-                                    struct bch_inode_unpacked *bi, void *p)
-{
-       BUG_ON(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY);
-
-       bi->bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
-       return 0;
-}
-
-static int inode_clear_i_sectors_dirty(struct bch_inode_info *inode,
-                                      struct bch_inode_unpacked *bi,
-                                      void *p)
+static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
+                                    struct bch_inode_unpacked *bi,
+                                    void *p)
 {
-       BUG_ON(!(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY));
+       struct i_sectors_hook *h = p;
 
-       bi->bi_sectors  = atomic64_read(&inode->ei_sectors);
-       bi->bi_flags    &= ~BCH_INODE_I_SECTORS_DIRTY;
+       if (h->new_i_size != U64_MAX &&
+           (!h->appending ||
+            h->new_i_size > bi->bi_size))
+               bi->bi_size = h->new_i_size;
+       bi->bi_sectors  += h->sectors;
+       bi->bi_flags    &= ~h->flags;
        return 0;
 }
 
-static void i_sectors_dirty_put(struct bch_fs *c,
-                               struct bch_inode_info *inode,
-                               struct i_sectors_hook *h)
+static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
 {
-       if (h->sectors) {
-               spin_lock(&inode->v.i_lock);
-               inode->v.i_blocks += h->sectors;
-               spin_unlock(&inode->v.i_lock);
+       int ret;
 
-               atomic64_add(h->sectors, &inode->ei_sectors);
-               EBUG_ON(atomic64_read(&inode->ei_sectors) < 0);
-       }
+       mutex_lock(&h->inode->ei_update_lock);
+       if (h->new_i_size != U64_MAX)
+               i_size_write(&h->inode->v, h->new_i_size);
 
-       EBUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count) <= 0);
+       __i_sectors_acct(c, h->inode, h->sectors);
 
-       mutex_lock(&inode->ei_update_lock);
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+       mutex_unlock(&h->inode->ei_update_lock);
 
-       if (atomic_long_dec_and_test(&inode->ei_sectors_dirty_count)) {
-               int ret = __bch2_write_inode(c, inode,
-                                         inode_clear_i_sectors_dirty, NULL);
+       h->sectors = 0;
 
-               ret = ret;
-       }
-
-       mutex_unlock(&inode->ei_update_lock);
+       return ret;
 }
 
-static int __must_check i_sectors_dirty_get(struct bch_fs *c,
-                                           struct bch_inode_info *inode,
-                                           struct i_sectors_hook *h)
+static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
+                                   struct bch_inode_unpacked *bi, void *p)
 {
-       int ret = 0;
+       struct i_sectors_hook *h = p;
 
-       h->hook.fn      = i_sectors_hook_fn;
-       h->sectors      = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
-       h->inode        = inode;
-#endif
+       if (h->flags & BCH_INODE_I_SIZE_DIRTY)
+               bi->bi_size = h->new_i_size;
 
-       if (atomic_long_inc_not_zero(&inode->ei_sectors_dirty_count))
-               return 0;
-
-       mutex_lock(&inode->ei_update_lock);
-
-       if (!(inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY))
-               ret = __bch2_write_inode(c, inode, inode_set_i_sectors_dirty,
-                                        NULL);
+       bi->bi_flags |= h->flags;
+       return 0;
+}
 
-       if (!ret)
-               atomic_long_inc(&inode->ei_sectors_dirty_count);
+static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+{
+       int ret;
 
-       mutex_unlock(&inode->ei_update_lock);
+       mutex_lock(&h->inode->ei_update_lock);
+       ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+       mutex_unlock(&h->inode->ei_update_lock);
 
        return ret;
 }
 
+static inline struct i_sectors_hook
+i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
+{
+       return (struct i_sectors_hook) {
+               .hook.fn        = i_sectors_hook_fn,
+               .inode          = inode,
+               .sectors        = 0,
+               .new_i_size     = U64_MAX,
+               .flags          = flags|BCH_INODE_I_SECTORS_DIRTY,
+       };
+}
+
+/* normal i_size/i_sectors update machinery: */
+
 struct bchfs_extent_trans_hook {
        struct bchfs_write_op           *op;
        struct extent_insert_hook       hook;
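
The old atomic get/put accounting is replaced by an explicit bracket: i_sectors_dirty_start() marks the on-disk inode with BCH_INODE_I_SECTORS_DIRTY (plus any caller-supplied flags), the extent hook accumulates the sector delta, and i_sectors_dirty_finish() applies the delta and any new size and clears the flags in a single inode update. The call shape used by truncate, fpunch and fcollapse below:

	struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);

	ret = i_sectors_dirty_start(c, &i_sectors_hook);
	if (ret)
		goto err;

	/* ... btree updates, passing &i_sectors_hook.hook ... */

	/* run finish even if the update failed, to clear the flags: */
	ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
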
@@ -289,18 +264,18 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
        BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
 
        /* XXX: inode->i_size locking */
-       if (offset > inode->ei_size) {
-               BUG_ON(inode->ei_flags & BCH_INODE_I_SIZE_DIRTY);
-
+       if (offset > inode->ei_inode.bi_size) {
                if (!h->need_inode_update) {
                        h->need_inode_update = true;
                        return BTREE_INSERT_NEED_TRAVERSE;
                }
 
+               BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+
                h->inode_u.bi_size = offset;
                do_pack = true;
 
-               inode->ei_size = offset;
+               inode->ei_inode.bi_size = offset;
 
                if (h->op->is_dio)
                        i_size_write(&inode->v, offset);
@@ -315,15 +290,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                h->inode_u.bi_sectors += sectors;
                do_pack = true;
 
-               atomic64_add(sectors, &inode->ei_sectors);
-
                h->op->sectors_added += sectors;
-
-               if (h->op->is_dio) {
-                       spin_lock(&inode->v.i_lock);
-                       inode->v.i_blocks += sectors;
-                       spin_unlock(&inode->v.i_lock);
-               }
        }
 
        if (do_pack)
@@ -340,6 +307,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
        struct btree_iter extent_iter, inode_iter;
        struct bchfs_extent_trans_hook hook;
        struct bkey_i *k = bch2_keylist_front(keys);
+       s64 orig_sectors_added = op->sectors_added;
        int ret;
 
        BUG_ON(k->k.p.inode != op->inode->v.i_ino);
@@ -362,7 +330,8 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
 
                /* XXX: inode->i_size locking */
                k = bch2_keylist_front(keys);
-               if (min(k->k.p.offset << 9, op->new_i_size) > op->inode->ei_size)
+               if (min(k->k.p.offset << 9, op->new_i_size) >
+                   op->inode->ei_inode.bi_size)
                        hook.need_inode_update = true;
 
                if (hook.need_inode_update) {
@@ -430,9 +399,41 @@ err:
        bch2_btree_iter_unlock(&extent_iter);
        bch2_btree_iter_unlock(&inode_iter);
 
+       if (op->is_dio)
+               i_sectors_acct(wop->c, op->inode,
+                              op->sectors_added - orig_sectors_added);
+
        return ret;
 }
 
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+                                       struct bch_fs *c,
+                                       struct bch_inode_info *inode,
+                                       struct bch_io_opts opts,
+                                       bool is_dio)
+{
+       op->inode               = inode;
+       op->sectors_added       = 0;
+       op->is_dio              = is_dio;
+       op->unalloc             = false;
+       op->new_i_size          = U64_MAX;
+
+       bch2_write_op_init(&op->op, c);
+       op->op.csum_type        = bch2_data_checksum_type(c, opts.data_checksum);
+       op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+       op->op.devs             = c->fastest_devs;
+       op->op.index_update_fn  = bchfs_write_index_update;
+       op_journal_seq_set(&op->op, &inode->ei_journal_seq);
+}
+
+static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
+{
+       struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+       bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
+       return opts;
+}
+
 /* page state: */
 
 /* stored in page->private: */
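
IO options are now derived per inode: io_opts() starts from the filesystem-wide options and overlays the inode's own, and bch2_fswrite_op_init() seeds the write op's checksum and compression types from the result, so per-inode settings take effect on every write path. A typical setup, per the helpers above:

	struct bch_io_opts opts = io_opts(c, inode);
	struct bchfs_write_op op;

	bch2_fswrite_op_init(&op, c, inode, opts, false /* buffered */);
	/* op.op.csum_type / op.op.compression_type now follow opts */
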
@@ -551,11 +552,8 @@ static void bch2_clear_page_bits(struct page *page)
        s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
        ClearPagePrivate(page);
 
-       if (s.dirty_sectors) {
-               spin_lock(&inode->v.i_lock);
-               inode->v.i_blocks -= s.dirty_sectors;
-               spin_unlock(&inode->v.i_lock);
-       }
+       if (s.dirty_sectors)
+               i_sectors_acct(c, inode, -s.dirty_sectors);
 
        if (s.reserved)
                bch2_disk_reservation_put(c, &res);
@@ -563,19 +561,16 @@ static void bch2_clear_page_bits(struct page *page)
 
 int bch2_set_page_dirty(struct page *page)
 {
+       struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_page_state old, new;
 
        old = page_state_cmpxchg(page_state(page), new,
                new.dirty_sectors = PAGE_SECTORS - new.sectors;
        );
 
-       if (old.dirty_sectors != new.dirty_sectors) {
-               struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-
-               spin_lock(&inode->v.i_lock);
-               inode->v.i_blocks += new.dirty_sectors - old.dirty_sectors;
-               spin_unlock(&inode->v.i_lock);
-       }
+       if (old.dirty_sectors != new.dirty_sectors)
+               i_sectors_acct(c, inode, new.dirty_sectors - old.dirty_sectors);
 
        return __set_page_dirty_nobuffers(page);
 }
@@ -624,7 +619,7 @@ static void bch2_readpages_end_io(struct bio *bio)
        bio_for_each_segment_all(bv, bio, i) {
                struct page *page = bv->bv_page;
 
-               if (!bio->bi_error) {
+               if (!bio->bi_status) {
                        SetPageUptodate(page);
                } else {
                        ClearPageUptodate(page);
@@ -846,6 +841,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
 {
        struct bch_inode_info *inode = to_bch_ei(mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts = io_opts(c, inode);
        struct btree_iter iter;
        struct page *page;
        struct readpages_iter readpages_iter = {
@@ -868,7 +864,8 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
                                   c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
 
                struct bch_read_bio *rbio =
-                       to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
+                       rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+                                 opts);
 
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                bio_add_page_contig(&rbio->bio, page);
@@ -914,9 +911,10 @@ int bch2_readpage(struct file *file, struct page *page)
 {
        struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts = io_opts(c, inode);
        struct bch_read_bio *rbio;
 
-       rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
+       rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
        rbio->bio.bi_end_io = bch2_readpages_end_io;
 
        __bchfs_readpage(c, rbio, inode->v.i_ino, page);
@@ -925,8 +923,15 @@ int bch2_readpage(struct file *file, struct page *page)
 
 struct bch_writepage_state {
        struct bch_writepage_io *io;
+       struct bch_io_opts      opts;
 };
 
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+                                                                 struct bch_inode_info *inode)
+{
+       return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+}
+
 static void bch2_writepage_io_free(struct closure *cl)
 {
        struct bch_writepage_io *io = container_of(cl,
@@ -982,13 +987,8 @@ static void bch2_writepage_io_done(struct closure *cl)
         * PageWriteback is effectively our ref on the inode - fixup i_blocks
         * before calling end_page_writeback:
         */
-       if (io->op.sectors_added) {
-               struct bch_inode_info *inode = io->op.inode;
-
-               spin_lock(&inode->v.i_lock);
-               inode->v.i_blocks += io->op.sectors_added;
-               spin_unlock(&inode->v.i_lock);
-       }
+       if (io->op.sectors_added)
+               i_sectors_acct(c, io->op.inode, io->op.sectors_added);
 
        bio_for_each_segment_all(bvec, bio, i)
                end_page_writeback(bvec->bv_page);
@@ -1004,8 +1004,6 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
        w->io = NULL;
        atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
 
-       io->op.op.pos.offset = bio->bi_iter.bi_sector;
-
        closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
        continue_at(&io->cl, bch2_writepage_io_done, NULL);
 }
@@ -1017,46 +1015,26 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
 static void bch2_writepage_io_alloc(struct bch_fs *c,
                                    struct bch_writepage_state *w,
                                    struct bch_inode_info *inode,
-                                   struct page *page)
-{
-       u64 inum = inode->v.i_ino;
-       unsigned nr_replicas = page_state(page)->nr_replicas;
-
-       EBUG_ON(!nr_replicas);
-       /* XXX: disk_reservation->gen isn't plumbed through */
-
-       if (!w->io) {
-alloc_io:
-               w->io = container_of(bio_alloc_bioset(GFP_NOFS,
-                                                     BIO_MAX_PAGES,
-                                                     &c->writepage_bioset),
-                                    struct bch_writepage_io, op.op.wbio.bio);
-
-               closure_init(&w->io->cl, NULL);
-               bch2_fswrite_op_init(&w->io->op, inode, false);
-               bch2_write_op_init(&w->io->op.op, c,
-                               (struct disk_reservation) {
-                                       .nr_replicas = c->opts.data_replicas,
-                               },
-                               c->fastest_devs,
-                               writepoint_hashed(inode->ei_last_dirtied),
-                               POS(inum, 0),
-                               &inode->ei_journal_seq,
-                               0);
-               w->io->op.op.index_update_fn = bchfs_write_index_update;
-       }
+                                   struct page *page,
+                                   struct bch_page_state s)
+{
+       struct bch_write_op *op;
+       u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
 
-       if (w->io->op.op.res.nr_replicas != nr_replicas ||
-           bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
-               bch2_writepage_do_io(w);
-               goto alloc_io;
-       }
+       w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+                                             BIO_MAX_PAGES,
+                                             &c->writepage_bioset),
+                            struct bch_writepage_io, op.op.wbio.bio);
+       op = &w->io->op.op;
 
-       /*
-        * We shouldn't ever be handed pages for multiple inodes in a single
-        * pass - right?
-        */
-       BUG_ON(inode != w->io->op.inode);
+       closure_init(&w->io->cl, NULL);
+
+       bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
+       op->nr_replicas         = s.nr_replicas;
+       op->res.nr_replicas     = s.nr_replicas;
+       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->pos                 = POS(inode->v.i_ino, offset);
+       op->wbio.bio.bi_iter.bi_sector = offset;
 }
 
 static int __bch2_writepage(struct bch_fs *c, struct page *page,
@@ -1091,32 +1069,39 @@ static int __bch2_writepage(struct bch_fs *c, struct page *page,
         */
        zero_user_segment(page, offset, PAGE_SIZE);
 do_io:
-       bch2_writepage_io_alloc(c, w, inode, page);
-
-       /* while page is locked: */
-       w->io->op.new_i_size = i_size;
-
-       if (wbc->sync_mode == WB_SYNC_ALL)
-               w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
-
        /* Before unlocking the page, transfer reservation to w->io: */
        old = page_state_cmpxchg(page_state(page), new, {
                EBUG_ON(!new.reserved &&
                        (new.sectors != PAGE_SECTORS ||
                        !new.allocated));
 
-               if (new.allocated &&
-                   w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+               if (new.allocated && w->opts.compression)
                        new.allocated = 0;
                else if (!new.reserved)
-                       goto out;
+                       break;
                new.reserved = 0;
        });
 
-       w->io->op.op.res.sectors += PAGE_SECTORS *
-               (old.reserved - new.reserved) *
-               old.nr_replicas;
-out:
+       if (w->io &&
+           (w->io->op.op.res.nr_replicas != old.nr_replicas ||
+            !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
+               bch2_writepage_do_io(w);
+
+       if (!w->io)
+               bch2_writepage_io_alloc(c, w, inode, page, old);
+
+       BUG_ON(inode != w->io->op.inode);
+       BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+
+       if (old.reserved)
+               w->io->op.op.res.sectors += old.nr_replicas * PAGE_SECTORS;
+
+       /* while page is locked: */
+       w->io->op.new_i_size = i_size;
+
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
        BUG_ON(PageWriteback(page));
        set_page_writeback(page);
        unlock_page(page);
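
Note the reordering in the hunk above: the page's reservation state is read (and cleared) under the page lock first, then an open writepage IO that can't take this page (different replica count, or not physically contiguous) is flushed before a new one is allocated, so a bio never mixes pages with different reservation requirements. In outline:

	/* after page_state_cmpxchg() has transferred the reservation: */
	if (w->io &&
	    (w->io->op.op.res.nr_replicas != old.nr_replicas ||
	     !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
		bch2_writepage_do_io(w);	/* flush mismatched io */

	if (!w->io)
		bch2_writepage_io_alloc(c, w, inode, page, old);

	BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
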
@@ -1127,7 +1112,8 @@ out:
 int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state w = { NULL };
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(mapping->host));
        struct pagecache_iter iter;
        struct page *page;
        int ret = 0;
@@ -1275,7 +1261,8 @@ continue_unlock:
 int bch2_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state w = { NULL };
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
        int ret;
 
        ret = __bch2_writepage(c, page, wbc, &w);
@@ -1306,7 +1293,7 @@ static int bch2_read_single_page(struct page *page,
        __bchfs_readpage(c, rbio, inode->v.i_ino, page);
        wait_for_completion(&done);
 
-       ret = rbio->bio.bi_error;
+       ret = blk_status_to_errno(rbio->bio.bi_status);
        bio_put(&rbio->bio);
 
        if (ret < 0)
@@ -1440,8 +1427,8 @@ static void bch2_direct_IO_read_endio(struct bio *bio)
 {
        struct dio_read *dio = bio->bi_private;
 
-       if (bio->bi_error)
-               dio->ret = bio->bi_error;
+       if (bio->bi_status)
+               dio->ret = blk_status_to_errno(bio->bi_status);
 
        closure_put(&dio->cl);
 }
@@ -1456,6 +1443,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req,
                               struct file *file, struct bch_inode_info *inode,
                               struct iov_iter *iter, loff_t offset)
 {
+       struct bch_io_opts opts = io_opts(c, inode);
        struct dio_read *dio;
        struct bio *bio;
        bool sync = is_sync_kiocb(req);
@@ -1512,7 +1500,7 @@ start:
                ret = bio_iov_iter_get_pages(bio, iter);
                if (ret < 0) {
                        /* XXX: fault inject this path */
-                       bio->bi_error = ret;
+                       bio->bi_status = BLK_STS_RESOURCE;
                        bio_endio(bio);
                        break;
                }
@@ -1523,7 +1511,7 @@ start:
                if (iter->count)
                        closure_get(&dio->cl);
 
-               bch2_read(c, to_rbio(bio), inode->v.i_ino);
+               bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
        }
 
        if (sync) {
@@ -1542,9 +1530,9 @@ static long __bch2_dio_write_complete(struct dio_write *dio)
        struct file *file = dio->req->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct bch_inode_info *inode = file_bch_inode(file);
-       long ret = dio->error ?: dio->written;
+       long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
 
-       bch2_disk_reservation_put(dio->c, &dio->res);
+       bch2_disk_reservation_put(dio->c, &dio->iop.op.res);
 
        __pagecache_block_put(&mapping->add_lock);
        inode_dio_end(&inode->v);
@@ -1569,11 +1557,6 @@ static void bch2_dio_write_done(struct dio_write *dio)
        struct bio_vec *bv;
        int i;
 
-       dio->written += dio->iop.op.written << 9;
-
-       if (dio->iop.op.error)
-               dio->error = dio->iop.op.error;
-
        bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
                put_page(bv->bv_page);
 
@@ -1586,38 +1569,15 @@ static void bch2_do_direct_IO_write(struct dio_write *dio)
        struct file *file = dio->req->ki_filp;
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bio *bio = &dio->iop.op.wbio.bio;
-       unsigned flags = 0;
        int ret;
 
-       if ((dio->req->ki_flags & IOCB_DSYNC) &&
-           !dio->c->opts.journal_flush_disabled)
-               flags |= BCH_WRITE_FLUSH;
-
        ret = bio_iov_iter_get_pages(bio, &dio->iter);
        if (ret < 0) {
-               /*
-                * these didn't get initialized, but bch2_dio_write_done() will
-                * look at them:
-                */
-               dio->iop.op.error = 0;
-               dio->iop.op.written = 0;
-               dio->error = ret;
+               dio->iop.op.error = ret;
                return;
        }
 
-       dio->iop.sectors_added  = 0;
-       bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
-                          dio->c->fastest_devs,
-                          writepoint_hashed((unsigned long) dio->task),
-                          POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
-                          &inode->ei_journal_seq,
-                          flags);
-       dio->iop.op.index_update_fn = bchfs_write_index_update;
-
-       if (!dio->iop.unalloc) {
-               dio->res.sectors -= bio_sectors(bio);
-               dio->iop.op.res.sectors = bio_sectors(bio);
-       }
+       dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written);
 
        task_io_account_write(bio->bi_iter.bi_size);
 
@@ -1632,7 +1592,7 @@ static void bch2_dio_write_loop_async(struct closure *cl)
 
        bch2_dio_write_done(dio);
 
-       if (dio->iter.count && !dio->error) {
+       if (dio->iter.count && !dio->iop.op.error) {
                use_mm(dio->task->mm);
                pagecache_block_get(&mapping->add_lock);
 
@@ -1652,31 +1612,6 @@ static void bch2_dio_write_loop_async(struct closure *cl)
        }
 }
 
-static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
-                                     u64 size)
-{
-       struct btree_iter iter;
-       struct bpos end = pos;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       end.offset += size;
-
-       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
-                            BTREE_ITER_WITH_HOLES, k) {
-               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
-                       break;
-
-               if (!bch2_extent_is_fully_allocated(k)) {
-                       ret = -ENOSPC;
-                       break;
-               }
-       }
-       bch2_btree_iter_unlock(&iter);
-
-       return ret;
-}
-
 static int bch2_direct_IO_write(struct bch_fs *c,
                                struct kiocb *req, struct file *file,
                                struct bch_inode_info *inode,
@@ -1703,13 +1638,17 @@ static int bch2_direct_IO_write(struct bch_fs *c,
        closure_init(&dio->cl, NULL);
        dio->req                = req;
        dio->c                  = c;
-       dio->written            = 0;
-       dio->error              = 0;
        dio->offset             = offset;
        dio->iovec              = NULL;
        dio->iter               = *iter;
        dio->task               = current;
-       bch2_fswrite_op_init(&dio->iop, inode, true);
+       bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
+       dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
+       dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+       if ((dio->req->ki_flags & IOCB_DSYNC) &&
+           !c->opts.journal_flush_disabled)
+               dio->iop.op.flags |= BCH_WRITE_FLUSH;
 
        if (offset + iter->count > inode->v.i_size)
                sync = true;
@@ -1722,7 +1661,7 @@ static int bch2_direct_IO_write(struct bch_fs *c,
         * Have to then guard against racing with truncate (deleting data that
         * we would have been overwriting)
         */
-       ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+       ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0);
        if (unlikely(ret)) {
                if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
                                                      offset >> 9),
@@ -1735,6 +1674,8 @@ static int bch2_direct_IO_write(struct bch_fs *c,
                dio->iop.unalloc = true;
        }
 
+       dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
+
        inode_dio_begin(&inode->v);
        __pagecache_block_get(&mapping->add_lock);
 
@@ -1744,20 +1685,20 @@ static int bch2_direct_IO_write(struct bch_fs *c,
 
                        closure_sync(&dio->cl);
                        bch2_dio_write_done(dio);
-               } while (dio->iter.count && !dio->error);
+               } while (dio->iter.count && !dio->iop.op.error);
 
                closure_debug_destroy(&dio->cl);
                return __bch2_dio_write_complete(dio);
        } else {
                bch2_do_direct_IO_write(dio);
 
-               if (dio->iter.count && !dio->error) {
+               if (dio->iter.count && !dio->iop.op.error) {
                        if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
                                dio->iovec = kmalloc(dio->iter.nr_segs *
                                                     sizeof(struct iovec),
                                                     GFP_KERNEL);
                                if (!dio->iovec)
-                                       dio->error = -ENOMEM;
+                                       dio->iop.op.error = -ENOMEM;
                        } else {
                                dio->iovec = dio->inline_vecs;
                        }
@@ -1965,11 +1906,11 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
 }
 
-static int __bch2_truncate_page(struct address_space *mapping,
+static int __bch2_truncate_page(struct bch_inode_info *inode,
                                pgoff_t index, loff_t start, loff_t end)
 {
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
        unsigned start_offset = start & (PAGE_SIZE - 1);
        unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
        struct page *page;
@@ -2049,10 +1990,10 @@ out:
        return ret;
 }
 
-static int bch2_truncate_page(struct address_space *mapping, loff_t from)
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
 {
-       return __bch2_truncate_page(mapping, from >> PAGE_SHIFT,
-                                  from, from + PAGE_SIZE);
+       return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
+                                   from, from + PAGE_SIZE);
 }
 
 int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
@@ -2060,6 +2001,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        bool shrink = iattr->ia_size <= inode->v.i_size;
+       struct i_sectors_hook i_sectors_hook =
+               i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
        int ret = 0;
 
        inode_dio_wait(&inode->v);
@@ -2069,17 +2012,15 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 
        /* sync appends.. */
        /* XXX what protects inode->i_size? */
-       if (iattr->ia_size > inode->ei_size)
+       if (iattr->ia_size > inode->ei_inode.bi_size)
                ret = filemap_write_and_wait_range(mapping,
-                                                  inode->ei_size, S64_MAX);
+                                                  inode->ei_inode.bi_size, S64_MAX);
        if (ret)
                goto err_put_pagecache;
 
-       mutex_lock(&inode->ei_update_lock);
-       i_size_dirty_get(inode);
-       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
-       mutex_unlock(&inode->ei_update_lock);
+       i_sectors_hook.new_i_size = iattr->ia_size;
 
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
        if (unlikely(ret))
                goto err;
 
@@ -2090,45 +2031,32 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
         * here (new i_size < current i_size):
         */
        if (shrink) {
-               struct i_sectors_hook i_sectors_hook;
-               int ret;
-
-               ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+               ret = bch2_truncate_page(inode, iattr->ia_size);
                if (unlikely(ret))
                        goto err;
 
-               ret = bch2_truncate_page(inode->v.i_mapping, iattr->ia_size);
-               if (unlikely(ret)) {
-                       i_sectors_dirty_put(c, inode, &i_sectors_hook);
-                       goto err;
-               }
-
                ret = bch2_inode_truncate(c, inode->v.i_ino,
-                                        round_up(iattr->ia_size, PAGE_SIZE) >> 9,
-                                        &i_sectors_hook.hook,
-                                        &inode->ei_journal_seq);
-
-               i_sectors_dirty_put(c, inode, &i_sectors_hook);
-
+                                         round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+                                         &i_sectors_hook.hook,
+                                         &inode->ei_journal_seq);
                if (unlikely(ret))
                        goto err;
        }
 
-       mutex_lock(&inode->ei_update_lock);
        setattr_copy(&inode->v, iattr);
-       inode->v.i_mtime = inode->v.i_ctime = current_fs_time(inode->v.i_sb);
-out:
-       /* clear I_SIZE_DIRTY: */
-       i_size_dirty_put(inode);
-       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
-       mutex_unlock(&inode->ei_update_lock);
+       inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+err:
+       /*
+        * On error - in particular, bch2_truncate_page() error - don't clear
+        * I_SIZE_DIRTY, as we've left data above i_size:
+        */
+       if (ret)
+               i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
 
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 err_put_pagecache:
        pagecache_block_put(&mapping->add_lock);
        return ret;
-err:
-       mutex_lock(&inode->ei_update_lock);
-       goto out;
 }
 
 static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
@@ -2144,33 +2072,41 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
        inode_dio_wait(&inode->v);
        pagecache_block_get(&mapping->add_lock);
 
-       ret = __bch2_truncate_page(mapping,
+       ret = __bch2_truncate_page(inode,
                                   offset >> PAGE_SHIFT,
                                   offset, offset + len);
        if (unlikely(ret))
-               goto out;
+               goto err;
 
        if (offset >> PAGE_SHIFT !=
            (offset + len) >> PAGE_SHIFT) {
-               ret = __bch2_truncate_page(mapping,
+               ret = __bch2_truncate_page(inode,
                                           (offset + len) >> PAGE_SHIFT,
                                           offset, offset + len);
                if (unlikely(ret))
-                       goto out;
+                       goto err;
        }
 
        truncate_pagecache_range(&inode->v, offset, offset + len - 1);
 
        if (discard_start < discard_end) {
                struct disk_reservation disk_res;
-               struct i_sectors_hook i_sectors_hook;
+               struct i_sectors_hook i_sectors_hook =
+                       i_sectors_hook_init(inode, 0);
                int ret;
 
-               BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
-
-               ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+               ret = i_sectors_dirty_start(c, &i_sectors_hook);
                if (unlikely(ret))
-                       goto out;
+                       goto err;
+
+               /*
+                * We need to pass in a disk reservation here because we might
+                * be splitting a compressed extent into two. This isn't a
+                * problem with truncate because truncate will never split an
+                * extent, only truncate it...
+                */
+               ret = bch2_disk_reservation_get(c, &disk_res, 0, 0);
+               BUG_ON(ret);
 
                ret = bch2_btree_delete_range(c,
                                BTREE_ID_EXTENTS,
@@ -2180,11 +2116,11 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
                                &disk_res,
                                &i_sectors_hook.hook,
                                &inode->ei_journal_seq);
-
-               i_sectors_dirty_put(c, inode, &i_sectors_hook);
                bch2_disk_reservation_put(c, &disk_res);
+
+               ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
        }
-out:
+err:
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
 
@@ -2200,7 +2136,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        struct btree_iter dst;
        BKEY_PADDED(k) copy;
        struct bkey_s_c k;
-       struct i_sectors_hook i_sectors_hook;
+       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
        loff_t new_size;
        int ret;
 
@@ -2237,7 +2173,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
        if (ret)
                goto err;
 
-       ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
        if (ret)
                goto err;
 
@@ -2278,8 +2214,14 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
                                           BTREE_INSERT_ENTRY(&dst, &copy.k));
                bch2_disk_reservation_put(c, &disk_res);
 btree_iter_err:
-               if (ret < 0 && ret != -EINTR)
-                       goto err_unwind;
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       goto err_put_sectors_dirty;
+               /*
+                * XXX: if we error here we've left data with multiple
+                * pointers... which isn't a _super_ serious problem...
+                */
 
                bch2_btree_iter_cond_resched(&src);
        }
@@ -2292,30 +2234,18 @@ btree_iter_err:
                                 &i_sectors_hook.hook,
                                 &inode->ei_journal_seq);
        if (ret)
-               goto err_unwind;
-
-       i_sectors_dirty_put(c, inode, &i_sectors_hook);
+               goto err_put_sectors_dirty;
 
-       mutex_lock(&inode->ei_update_lock);
        i_size_write(&inode->v, new_size);
-       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
-       mutex_unlock(&inode->ei_update_lock);
-
+       i_sectors_hook.new_i_size = new_size;
+err_put_sectors_dirty:
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
        pagecache_block_put(&mapping->add_lock);
        inode_unlock(&inode->v);
 
-       return ret;
-err_unwind:
-       /*
-        * XXX: we've left data with multiple pointers... which isn't a _super_
-        * serious problem...
-        */
-       i_sectors_dirty_put(c, inode, &i_sectors_hook);
-err:
        bch2_btree_iter_unlock(&src);
        bch2_btree_iter_unlock(&dst);
-       pagecache_block_put(&mapping->add_lock);
-       inode_unlock(&inode->v);
        return ret;
 }
 
@@ -2324,11 +2254,11 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
 {
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct i_sectors_hook i_sectors_hook;
+       struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
        struct btree_iter iter;
-       struct bpos end;
+       struct bpos end_pos;
        loff_t block_start, block_end;
-       loff_t new_size = offset + len;
+       loff_t end = offset + len;
        unsigned sectors;
        unsigned replicas = READ_ONCE(c->opts.data_replicas);
        int ret;
@@ -2340,45 +2270,43 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
        inode_dio_wait(&inode->v);
        pagecache_block_get(&mapping->add_lock);
 
-       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-           new_size > inode->v.i_size) {
-               ret = inode_newsize_ok(&inode->v, new_size);
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+               ret = inode_newsize_ok(&inode->v, end);
                if (ret)
                        goto err;
        }
 
        if (mode & FALLOC_FL_ZERO_RANGE) {
-               ret = __bch2_truncate_page(mapping,
+               ret = __bch2_truncate_page(inode,
                                           offset >> PAGE_SHIFT,
-                                          offset, offset + len);
+                                          offset, end);
 
                if (!ret &&
-                   offset >> PAGE_SHIFT !=
-                   (offset + len) >> PAGE_SHIFT)
-                       ret = __bch2_truncate_page(mapping,
-                                                  (offset + len) >> PAGE_SHIFT,
-                                                  offset, offset + len);
+                   offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+                       ret = __bch2_truncate_page(inode,
+                                                  end >> PAGE_SHIFT,
+                                                  offset, end);
 
                if (unlikely(ret))
                        goto err;
 
-               truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+               truncate_pagecache_range(&inode->v, offset, end - 1);
 
                block_start     = round_up(offset, PAGE_SIZE);
-               block_end       = round_down(offset + len, PAGE_SIZE);
+               block_end       = round_down(end, PAGE_SIZE);
        } else {
                block_start     = round_down(offset, PAGE_SIZE);
-               block_end       = round_up(offset + len, PAGE_SIZE);
+               block_end       = round_up(end, PAGE_SIZE);
        }
 
        bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
-       end = POS(inode->v.i_ino, block_end >> 9);
+       end_pos = POS(inode->v.i_ino, block_end >> 9);
 
-       ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+       ret = i_sectors_dirty_start(c, &i_sectors_hook);
        if (unlikely(ret))
                goto err;
 
-       while (bkey_cmp(iter.pos, end) < 0) {
+       while (bkey_cmp(iter.pos, end_pos) < 0) {
                struct disk_reservation disk_res = { 0 };
                struct bkey_i_reservation reservation;
                struct bkey_s_c k;
@@ -2407,7 +2335,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                reservation.k.size      = k.k->size;
 
                bch2_cut_front(iter.pos, &reservation.k_i);
-               bch2_cut_back(end, &reservation.k);
+               bch2_cut_back(end_pos, &reservation.k);
 
                sectors = reservation.k.size;
                reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
@@ -2435,11 +2363,11 @@ btree_iter_err:
        }
        bch2_btree_iter_unlock(&iter);
 
-       i_sectors_dirty_put(c, inode, &i_sectors_hook);
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 
        if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-           new_size > inode->v.i_size) {
-               i_size_write(&inode->v, new_size);
+           end > inode->v.i_size) {
+               i_size_write(&inode->v, end);
 
                mutex_lock(&inode->ei_update_lock);
                ret = bch2_write_inode_size(c, inode, inode->v.i_size);
@@ -2449,14 +2377,14 @@ btree_iter_err:
        /* blech */
        if ((mode & FALLOC_FL_KEEP_SIZE) &&
            (mode & FALLOC_FL_ZERO_RANGE) &&
-           inode->ei_size != inode->v.i_size) {
+           inode->ei_inode.bi_size != inode->v.i_size) {
                /* sync appends.. */
                ret = filemap_write_and_wait_range(mapping,
-                                       inode->ei_size, S64_MAX);
+                                       inode->ei_inode.bi_size, S64_MAX);
                if (ret)
                        goto err;
 
-               if (inode->ei_size != inode->v.i_size) {
+               if (inode->ei_inode.bi_size != inode->v.i_size) {
                        mutex_lock(&inode->ei_update_lock);
                        ret = bch2_write_inode_size(c, inode, inode->v.i_size);
                        mutex_unlock(&inode->ei_update_lock);
@@ -2468,7 +2396,7 @@ btree_iter_err:
 
        return 0;
 err_put_sectors_dirty:
-       i_sectors_dirty_put(c, inode, &i_sectors_hook);
+       ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 err:
        bch2_btree_iter_unlock(&iter);
        pagecache_block_put(&mapping->add_lock);
@@ -2669,11 +2597,14 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
 int bch2_fs_fsio_init(struct bch_fs *c)
 {
        if (bioset_init(&c->writepage_bioset,
-                       4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
+                       4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+                       BIOSET_NEED_BVECS) ||
            bioset_init(&c->dio_read_bioset,
-                       4, offsetof(struct dio_read, rbio.bio)) ||
+                       4, offsetof(struct dio_read, rbio.bio),
+                       BIOSET_NEED_BVECS) ||
            bioset_init(&c->dio_write_bioset,
-                       4, offsetof(struct dio_write, iop.op.wbio.bio)))
+                       4, offsetof(struct dio_write, iop.op.wbio.bio),
+                       BIOSET_NEED_BVECS))
                return -ENOMEM;
 
        return 0;
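
bioset_init() in the userspace shim now takes a flags argument, mirroring the kernel API that stopped allocating a biovec pool unconditionally; BIOSET_NEED_BVECS requests one. A sketch of the updated call, with a hypothetical embedding struct (in-kernel context assumed):

    struct my_io {
            int             foo;    /* per-IO state lives in the front pad */
            struct bio      bio;    /* must be last */
    };

    struct bio_set bs;

    /* (pool_size, front_pad, flags) ordering assumed from the hunk above: */
    if (bioset_init(&bs, 4,                      /* mempool entries */
                    offsetof(struct my_io, bio), /* front-pad bytes */
                    BIOSET_NEED_BVECS))          /* also reserve a bvec pool */
            return -ENOMEM;
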
index bd915fec044bc7d996cdc9b94de9f92d8884c0ff..24228c8eb2d7f1079b4a21add0ba5fcb94a834ab 100644 (file)
@@ -75,7 +75,7 @@ do {                                                                  \
 /* Set VFS inode flags from bcachefs inode: */
 void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
 {
-       set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
+       set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
 }
 
 static int bch2_inode_flags_set(struct bch_inode_info *inode,
@@ -99,13 +99,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
                return -EINVAL;
 
        bi->bi_flags = newflags;
-       inode->v.i_ctime = current_fs_time(inode->v.i_sb);
+       inode->v.i_ctime = current_time(&inode->v);
        return 0;
 }
 
 static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
 {
-       unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
+       unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
 
        return put_user(flags, arg);
 }
@@ -153,7 +153,7 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
 {
        struct fsxattr fa = { 0 };
 
-       fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
+       fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
 
        return copy_to_user(arg, &fa, sizeof(fa));
 }
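
map_flags()/set_flags() translate between bcachefs's on-disk bi_flags and the VFS/FS_IOC flag spaces via a lookup table. The underlying idea, sketched with hypothetical types (the real tables and macros live earlier in fs-ioctl.c):

    struct flag_map { unsigned from, to; };

    static unsigned map_flags_sketch(const struct flag_map *tbl, size_t n,
                                     unsigned in)
    {
            unsigned out = 0;
            size_t i;

            for (i = 0; i < n; i++)
                    if (in & tbl[i].from)   /* translate each set bit */
                            out |= tbl[i].to;
            return out;
    }
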
index 43688cd34de9a992a3c7833be717fa08488a998f..cb0397f1343beac8186a18e703d256f642b25eab 100644 (file)
@@ -12,6 +12,7 @@
 #include "fs-ioctl.h"
 #include "fsck.h"
 #include "inode.h"
+#include "io.h"
 #include "journal.h"
 #include "keylist.h"
 #include "super.h"
@@ -130,10 +131,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
                                BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
        } while (ret == -EINTR);
 
-       if (!ret) {
-               inode->ei_size  = inode_u.bi_size;
-               inode->ei_flags = inode_u.bi_flags;
-       }
+       if (!ret)
+               inode->ei_inode = inode_u;
 out:
        bch2_btree_iter_unlock(&iter);
 
@@ -146,7 +145,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
        return __bch2_write_inode(c, inode, NULL, NULL);
 }
 
-int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
 {
        int ret;
 
@@ -158,7 +157,7 @@ int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
        return ret;
 }
 
-int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
 {
        int ret = 0;
 
@@ -223,7 +222,9 @@ static struct bch_inode_info *bch2_vfs_inode_create(struct bch_fs *c,
        bch2_inode_init(c, &inode_u,
                        i_uid_read(&inode->v),
                        i_gid_read(&inode->v),
-                       inode->v.i_mode, rdev);
+                       inode->v.i_mode, rdev,
+                       &dir->ei_inode);
+
        ret = bch2_inode_create(c, &inode_u,
                                BLOCKDEV_INODE_MAX, 0,
                                &c->unused_inode_hint);
@@ -277,7 +278,7 @@ static int bch2_vfs_dirent_create(struct bch_fs *c,
        if (unlikely(ret))
                return ret;
 
-       dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
+       dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
        mark_inode_dirty_sync(&dir->v);
        return 0;
 }
@@ -344,7 +345,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
 
        lockdep_assert_held(&inode->v.i_rwsem);
 
-       inode->v.i_ctime = current_fs_time(dir->v.i_sb);
+       inode->v.i_ctime = current_time(&dir->v);
 
        ret = bch2_inc_nlink(c, inode);
        if (ret)
@@ -473,7 +474,7 @@ static int bch2_rename(struct bch_fs *c,
 {
        struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
        struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_fs_time(old_dir->v.i_sb);
+       struct timespec now = current_time(&old_dir->v);
        int ret;
 
        lockdep_assert_held(&old_dir->v.i_rwsem);
@@ -551,7 +552,7 @@ static int bch2_rename_exchange(struct bch_fs *c,
 {
        struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
        struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
-       struct timespec now = current_fs_time(old_dir->v.i_sb);
+       struct timespec now = current_time(&old_dir->v);
        int ret;
 
        ret = bch2_dirent_rename(c,
@@ -909,10 +910,8 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->v.i_ctime        = bch2_time_to_timespec(c, bi->bi_ctime);
 
        inode->ei_journal_seq   = 0;
-       inode->ei_size          = bi->bi_size;
-       inode->ei_flags         = bi->bi_flags;
-       atomic64_set(&inode->ei_sectors, bi->bi_sectors);
        inode->ei_str_hash      = bch2_hash_info_init(c, bi);
+       inode->ei_inode         = *bi;
 
        bch2_inode_flags_to_vfs(inode);
 
@@ -949,8 +948,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
        inode_init_once(&inode->v);
        mutex_init(&inode->ei_update_lock);
        inode->ei_journal_seq = 0;
-       atomic_long_set(&inode->ei_size_dirty_count, 0);
-       atomic_long_set(&inode->ei_sectors_dirty_count, 0);
 
        return &inode->v;
 }
@@ -995,12 +992,6 @@ static void bch2_evict_inode(struct inode *vinode)
 
        truncate_inode_pages_final(&inode->v.i_data);
 
-       if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
-               /* XXX - we want to check this stuff iff there weren't IO errors: */
-               BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
-               BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
-       }
-
        clear_inode(&inode->v);
 
        if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
@@ -1272,9 +1263,16 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
        sb->s_magic             = BCACHEFS_STATFS_MAGIC;
        sb->s_time_gran         = c->sb.time_precision;
        c->vfs_sb               = sb;
-       sb->s_bdi               = &c->bdi;
        strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
 
+       ret = super_setup_bdi(sb);
+       if (ret)
+               goto err_put_super;
+
+       sb->s_bdi->congested_fn         = bch2_congested;
+       sb->s_bdi->congested_data       = c;
+       sb->s_bdi->ra_pages             = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+
        for_each_online_member(ca, c, i) {
                struct block_device *bdev = ca->disk_sb.bdev;
 
index d255ca7c375498f86e5a5e2b6fc4bffd87e27f77..652105fb6ac840d8b1d057a09cc93d466020bb17 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _BCACHEFS_FS_H
 #define _BCACHEFS_FS_H
 
+#include "opts.h"
 #include "str_hash.h"
 
 #include <linux/seqlock.h>
@@ -11,22 +12,12 @@ struct bch_inode_info {
 
        struct mutex            ei_update_lock;
        u64                     ei_journal_seq;
-
-       atomic_long_t           ei_size_dirty_count;
-
-       /*
-        * these are updated whenever we update the inode in the btree - for
-        * e.g. fsync
-        */
-       u64                     ei_size;
-       u32                     ei_flags;
-
-       atomic_long_t           ei_sectors_dirty_count;
-       atomic64_t              ei_sectors;
+       unsigned long           ei_last_dirtied;
 
        struct bch_hash_info    ei_str_hash;
 
-       unsigned long           ei_last_dirtied;
+       /* copy of inode in btree: */
+       struct bch_inode_unpacked ei_inode;
 };
 
 #define to_bch_ei(_inode)                                      \
index 4760b16e8cc0acfe6f7fcbdaa7f8fbc7e5816fe1..696926fef8b4be67e7117a5961a82c032f4a6023 100644 (file)
@@ -204,7 +204,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
                        "hash table key at wrong offset: %llu, "
                        "hashed to %llu chain starts at %llu\n%s",
                        k.k->p.offset, hashed, h->chain.pos.offset,
-                       bch2_bkey_val_to_text(c, desc.btree_id,
+                       bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
                                              buf, sizeof(buf), k))) {
                ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
                if (ret) {
@@ -224,7 +224,7 @@ static int hash_check_key(const struct bch_hash_desc desc,
                if (fsck_err_on(k2.k->type == desc.key_type &&
                                !desc.cmp_bkey(k, k2), c,
                                "duplicate hash table keys:\n%s",
-                               bch2_bkey_val_to_text(c, desc.btree_id,
+                               bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
                                                      buf, sizeof(buf), k))) {
                        ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
                        if (ret)
@@ -397,9 +397,9 @@ static int check_dirents(struct bch_fs *c)
 
                if (fsck_err_on(have_target &&
                                d.v->d_type !=
-                               mode_to_type(le16_to_cpu(target.bi_mode)), c,
+                               mode_to_type(target.bi_mode), c,
                                "incorrect d_type: should be %u:\n%s",
-                               mode_to_type(le16_to_cpu(target.bi_mode)),
+                               mode_to_type(target.bi_mode),
                                bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
                                                      buf, sizeof(buf), k))) {
                        struct bkey_i_dirent *n;
@@ -411,7 +411,7 @@ static int check_dirents(struct bch_fs *c)
                        }
 
                        bkey_reassemble(&n->k_i, d.s_c);
-                       n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
+                       n->v.d_type = mode_to_type(target.bi_mode);
 
                        ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
                                        BTREE_INSERT_NOFAIL,
@@ -493,7 +493,8 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
 fsck_err:
        return ret;
 create_root:
-       bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+       bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+                       0, NULL);
        root_inode->bi_inum = BCACHEFS_ROOT_INO;
 
        bch2_inode_pack(&packed, root_inode);
@@ -545,7 +546,8 @@ create_lostfound:
        if (ret)
                return ret;
 
-       bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+       bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+                       0, root_inode);
 
        ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
                               &c->unused_inode_hint);
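
bch2_inode_init() now takes the parent's unpacked inode, and create_lostfound passes root_inode so lost+found inherits per-inode options. The inheritance is an X-macro copy; a sketch with a hypothetical field list standing in for the real BCH_INODE_FIELDS_INHERIT():

    /* Hypothetical inherit list; the real one is defined elsewhere: */
    #define FIELDS_INHERIT_SKETCH()                 \
            BCH_INODE_FIELD(bi_data_checksum)       \
            BCH_INODE_FIELD(bi_compression)

    static void inherit_sketch(struct bch_inode_unpacked *child,
                               const struct bch_inode_unpacked *parent)
    {
    #define BCH_INODE_FIELD(_name) child->_name = parent->_name;
            FIELDS_INHERIT_SKETCH()
    #undef BCH_INODE_FIELD
    }
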
index 05f617aeaea77f5c0acca74cb4565bcc415bf543..71a24cc6688690d4e84576954acfe492858b81e5 100644 (file)
@@ -198,6 +198,12 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
                if (bch2_inode_unpack(inode, &unpacked))
                        return "invalid variable length fields";
 
+               if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+                       return "invalid data checksum type";
+
+               if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+                       return "invalid compression type";
+
                return NULL;
        }
        case BCH_INODE_BLOCKDEV:
@@ -221,6 +227,7 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
 static void bch2_inode_to_text(struct bch_fs *c, char *buf,
                               size_t size, struct bkey_s_c k)
 {
+       char *out = buf, *end = out + size;
        struct bkey_s_c_inode inode;
        struct bch_inode_unpacked unpacked;
 
@@ -228,11 +235,14 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
        case BCH_INODE_FS:
                inode = bkey_s_c_to_inode(k);
                if (bch2_inode_unpack(inode, &unpacked)) {
-                       scnprintf(buf, size, "(unpack error)");
+                       out += scnprintf(out, end - out, "(unpack error)");
                        break;
                }
 
-               scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
+#define BCH_INODE_FIELD(_name, _bits)                                          \
+               out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
+               BCH_INODE_FIELDS()
+#undef  BCH_INODE_FIELD
                break;
        }
 }
@@ -243,9 +253,12 @@ const struct bkey_ops bch2_bkey_inode_ops = {
 };
 
 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
-                    uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
+                    uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+                    struct bch_inode_unpacked *parent)
 {
-       s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
+       s64 now = timespec_to_bch2_time(c,
+               timespec_trunc(current_kernel_time(),
+                              c->sb.time_precision));
 
        memset(inode_u, 0, sizeof(*inode_u));
 
@@ -261,6 +274,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
        inode_u->bi_mtime       = now;
        inode_u->bi_ctime       = now;
        inode_u->bi_otime       = now;
+
+       if (parent) {
+#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
+               BCH_INODE_FIELDS_INHERIT()
+#undef BCH_INODE_FIELD
+       }
 }
 
 int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -416,7 +435,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
                        struct bch_inode_unpacked inode_u;
 
                        if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
-                               bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
+                               bi_generation = inode_u.bi_generation + 1;
                        break;
                }
                case BCH_INODE_GENERATION: {
index 53c70617b2890f50f0cc235386d29428f8075fe4..8ebb6fb6d6d0240cad670869198e0579686a7287 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _BCACHEFS_INODE_H
 #define _BCACHEFS_INODE_H
 
+#include "opts.h"
+
 #include <linux/math64.h>
 
 extern const struct bkey_ops bch2_bkey_inode_ops;
@@ -28,7 +30,8 @@ void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *)
 int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 
 void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
-                   uid_t, gid_t, umode_t, dev_t);
+                    uid_t, gid_t, umode_t, dev_t,
+                    struct bch_inode_unpacked *);
 int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
                      u64, u64, u64 *);
 int bch2_inode_truncate(struct bch_fs *, u64, u64,
@@ -55,6 +58,45 @@ static inline u64 timespec_to_bch2_time(struct bch_fs *c, struct timespec ts)
        return div_s64(ns, c->sb.time_precision);
 }
 
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+       struct bch_io_opts ret = { 0 };
+
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (inode->bi_##_name)                                          \
+               opt_set(ret, _name, inode->bi_##_name - 1);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+                                       enum bch_opt_id id, u64 v)
+{
+       switch (id) {
+#define BCH_INODE_OPT(_name, ...)                                      \
+       case Opt_##_name:                                               \
+               inode->bi_##_name = v;                                  \
+               break;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       default:
+               BUG();
+       }
+}
+
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+                                     enum bch_opt_id id, u64 v)
+{
+       return __bch2_inode_opt_set(inode, id, v + 1);
+}
+
+static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
+                                       enum bch_opt_id id)
+{
+       return __bch2_inode_opt_set(inode, id, 0);
+}
+
 #ifdef CONFIG_BCACHEFS_DEBUG
 void bch2_inode_pack_test(void);
 #else
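
The per-inode option accessors above use a +1 bias: a stored value of 0 means "unset, fall back to the filesystem-wide option", and n means "option value n - 1". That is why bch2_inode_opt_set() adds one and bch2_inode_opts_get() subtracts one. A self-contained sketch of the scheme:

    /* 0 = inherit the fs-wide default; n = explicit value n - 1. */
    static u64 inode_opt_get_sketch(u64 stored, u64 fs_default)
    {
            return stored ? stored - 1 : fs_default;
    }

    static u64 inode_opt_set_sketch(u64 v)
    {
            return v + 1;       /* the bias frees up 0 to mean "unset" */
    }
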
index 0c41e4111a43a0fb25ad9316b7c66c6ca6e15e40..3369a2ffccfa4e045ef8250e896bd55b7028bd62 100644 (file)
@@ -20,6 +20,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "super.h"
 #include "super-io.h"
 
 #include <linux/blkdev.h>
@@ -139,7 +140,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
        const struct bch_extent_ptr *ptr;
        struct bch_write_bio *n;
        struct bch_dev *ca;
-       unsigned ptr_idx = 0;
 
        BUG_ON(c->opts.nochanges);
 
@@ -147,7 +147,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
                       !c->devs[ptr->dev]);
 
-               ca = c->devs[ptr->dev];
+               ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (ptr + 1 < &extent_entry_last(e)->ptr) {
                        n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@@ -168,7 +168,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                n->c                    = c;
                n->ca                   = ca;
-               n->ptr_idx              = ptr_idx++;
                n->submit_time_us       = local_clock_us();
                n->bio.bi_iter.bi_sector = ptr->offset;
 
@@ -184,7 +183,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                        submit_bio(&n->bio);
                } else {
                        n->have_io_ref          = false;
-                       bcache_io_error(c, &n->bio, "device has been removed");
+                       n->bio.bi_status        = BLK_STS_REMOVED;
                        bio_endio(&n->bio);
                }
        }
@@ -201,9 +200,12 @@ static void bch2_write_done(struct closure *cl)
        if (!op->error && (op->flags & BCH_WRITE_FLUSH))
                op->error = bch2_journal_error(&op->c->journal);
 
-       bch2_disk_reservation_put(op->c, &op->res);
+       if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+               bch2_disk_reservation_put(op->c, &op->res);
        percpu_ref_put(&op->c->writes);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
+       op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
+
        closure_return(cl);
 }
 
@@ -244,9 +246,37 @@ static void bch2_write_index(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
        struct keylist *keys = &op->insert_keys;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       struct bkey_i *src, *dst = keys->keys, *n;
+       int ret;
 
        op->flags |= BCH_WRITE_LOOPED;
 
+       for (src = keys->keys; src != keys->top; src = n) {
+               n = bkey_next(src);
+               bkey_copy(dst, src);
+
+               e = bkey_i_to_s_extent(dst);
+               extent_for_each_ptr_backwards(e, ptr)
+                       if (test_bit(ptr->dev, op->failed.d))
+                               bch2_extent_drop_ptr(e, ptr);
+
+               ret = bch2_extent_nr_ptrs(e.c)
+                       ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+                       : -EIO;
+               if (ret) {
+                       keys->top = keys->keys;
+                       op->error = ret;
+                       op->flags |= BCH_WRITE_DONE;
+                       goto err;
+               }
+
+               dst = bkey_next(dst);
+       }
+
+       keys->top = dst;
+
        if (!bch2_keylist_empty(keys)) {
                u64 sectors_start = keylist_sectors(keys);
                int ret = op->index_update_fn(op);
@@ -260,7 +290,7 @@ static void bch2_write_index(struct closure *cl)
                        op->error = ret;
                }
        }
-
+err:
        bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
 
        if (!(op->flags & BCH_WRITE_DONE))
@@ -276,43 +306,6 @@ static void bch2_write_index(struct closure *cl)
        }
 }
 
-static void bch2_write_io_error(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct keylist *keys = &op->insert_keys;
-       struct bch_fs *c = op->c;
-       struct bch_extent_ptr *ptr;
-       struct bkey_i *k;
-       int ret;
-
-       for_each_keylist_key(keys, k) {
-               struct bkey_i *n = bkey_next(k);
-               struct bkey_s_extent e = bkey_i_to_s_extent(k);
-
-               extent_for_each_ptr_backwards(e, ptr)
-                       if (test_bit(ptr->dev, op->failed.d))
-                               bch2_extent_drop_ptr(e, ptr);
-
-               memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
-               keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
-
-               ret = bch2_extent_nr_ptrs(e.c)
-                       ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
-                       : -EIO;
-               if (ret) {
-                       keys->top = keys->keys;
-                       op->error = ret;
-                       op->flags |= BCH_WRITE_DONE;
-                       break;
-               }
-       }
-
-       memset(&op->failed, 0, sizeof(op->failed));
-
-       bch2_write_index(cl);
-       return;
-}
-
 static void bch2_write_endio(struct bio *bio)
 {
        struct closure *cl              = bio->bi_private;
@@ -324,10 +317,8 @@ static void bch2_write_endio(struct bio *bio)
 
        bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
 
-       if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
                set_bit(ca->dev_idx, op->failed.d);
-               set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
-       }
 
        if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
@@ -706,11 +697,6 @@ do_write:
 
        key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 
-       ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
-                                   BCH_DATA_USER);
-       if (ret)
-               goto err;
-
        dst->bi_end_io  = bch2_write_endio;
        dst->bi_private = &op->cl;
        bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@@ -870,7 +856,8 @@ void bch2_write(struct closure *cl)
            !percpu_ref_tryget(&c->writes)) {
                __bcache_io_error(c, "read only");
                op->error = -EROFS;
-               bch2_disk_reservation_put(c, &op->res);
+               if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+                       bch2_disk_reservation_put(c, &op->res);
                closure_return(cl);
        }
 
@@ -916,7 +903,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
        swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
        rbio->promote = NULL;
 
-       __bch2_write_op_init(&op->write.op, c);
+       bch2_write_op_init(&op->write.op, c);
+       op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
+       op->write.op.compression_type =
+               bch2_compression_opt_to_type(rbio->opts.compression);
 
        op->write.move_dev      = -1;
        op->write.op.devs       = c->fastest_devs;
@@ -1060,7 +1050,7 @@ static void bch2_rbio_retry(struct work_struct *work)
        if (rbio->split)
                rbio = bch2_rbio_free(rbio);
        else
-               rbio->bio.bi_error = 0;
+               rbio->bio.bi_status = 0;
 
        if (!(flags & BCH_READ_NODECODE))
                flags |= BCH_READ_MUST_CLONE;
@@ -1073,7 +1063,8 @@ static void bch2_rbio_retry(struct work_struct *work)
                __bch2_read(c, rbio, iter, inode, &avoid, flags);
 }
 
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+                           blk_status_t error)
 {
        rbio->retry = retry;
 
@@ -1081,7 +1072,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
                return;
 
        if (retry == READ_ERR) {
-               bch2_rbio_parent(rbio)->bio.bi_error = error;
+               bch2_rbio_parent(rbio)->bio.bi_status = error;
                bch2_rbio_done(rbio);
        } else {
                bch2_rbio_punt(rbio, bch2_rbio_retry,
@@ -1236,7 +1227,7 @@ csum_err:
         */
        if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
                rbio->flags |= BCH_READ_MUST_BOUNCE;
-               bch2_rbio_error(rbio, READ_RETRY, -EIO);
+               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
                return;
        }
 
@@ -1245,13 +1236,13 @@ csum_err:
                rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
                rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
                csum.hi, csum.lo, crc.csum_type);
-       bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
        return;
 decompression_err:
        __bcache_io_error(c, "decompression error, inode %llu offset %llu",
                          rbio->pos.inode,
                          (u64) rbio->bvec_iter.bi_sector);
-       bch2_rbio_error(rbio, READ_ERR, -EIO);
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
        return;
 }
 
@@ -1270,8 +1261,8 @@ static void bch2_read_endio(struct bio *bio)
        if (!rbio->split)
                rbio->bio.bi_end_io = rbio->end_io;
 
-       if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
-               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
+       if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
                return;
        }
 
@@ -1281,9 +1272,9 @@ static void bch2_read_endio(struct bio *bio)
                atomic_long_inc(&c->read_realloc_races);
 
                if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-                       bch2_rbio_error(rbio, READ_RETRY, -EINTR);
+                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
                else
-                       bch2_rbio_error(rbio, READ_ERR, -EINTR);
+                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
                return;
        }
 
@@ -1360,7 +1351,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 
                rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
                                        DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                       &c->bio_read_split));
+                                       &c->bio_read_split),
+                                orig->opts);
 
                bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
                split = true;
@@ -1374,7 +1366,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
                 * lose the error)
                 */
                rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
-                                               &c->bio_read_split));
+                                               &c->bio_read_split),
+                                orig->opts);
                rbio->bio.bi_iter = iter;
                split = true;
        } else {
@@ -1428,6 +1421,8 @@ noclone:
                bch2_read_endio(&rbio->bio);
 
                ret = rbio->retry;
+               if (rbio->split)
+                       rbio = bch2_rbio_free(rbio);
                if (!ret)
                        bch2_rbio_done(rbio);
        }
@@ -1503,7 +1498,7 @@ err:
         * possibly bigger than the memory that was
         * originally allocated)
         */
-       rbio->bio.bi_error = -EINTR;
+       rbio->bio.bi_status = BLK_STS_AGAIN;
        bio_endio(&rbio->bio);
        return;
 }
@@ -1561,6 +1556,7 @@ retry:
                        case READ_RETRY:
                                goto retry;
                        case READ_ERR:
+                               rbio->bio.bi_status = BLK_STS_IOERR;
                                bio_endio(&rbio->bio);
                                return;
                        };
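
Throughout io.c, bio->bi_error (a negative errno) becomes bio->bi_status (a blk_status_t), following the mainline v4.13 conversion; BLK_STS_IOERR and BLK_STS_AGAIN replace the old -EIO/-EINTR-style codes. A sketch of an endio handler under the new convention, independent of anything bcachefs-specific:

    static void endio_sketch(struct bio *bio)
    {
            if (bio->bi_status) {
                    /* blk_status_t is opaque; convert for logging */
                    int err = blk_status_to_errno(bio->bi_status);

                    pr_err("I/O error: %d\n", err);
            }
            bio_put(bio);
    }
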
index bd0d7c43c7a1fbff3208c2d80f8ab77e2ec4b8da..0c145eb67317660eb65614225d4eb757389a714f 100644 (file)
@@ -21,6 +21,8 @@ void bch2_latency_acct(struct bch_dev *, unsigned, int);
 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
                               enum bch_data_type, const struct bkey_i *);
 
+#define BLK_STS_REMOVED                ((__force blk_status_t)128)
+
 enum bch_write_flags {
        BCH_WRITE_ALLOC_NOWAIT          = (1 << 0),
        BCH_WRITE_CACHED                = (1 << 1),
@@ -29,11 +31,12 @@ enum bch_write_flags {
        BCH_WRITE_PAGES_STABLE          = (1 << 4),
        BCH_WRITE_PAGES_OWNED           = (1 << 5),
        BCH_WRITE_ONLY_SPECIFIED_DEVS   = (1 << 6),
+       BCH_WRITE_NOPUT_RESERVATION     = (1 << 7),
 
        /* Internal: */
-       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 7),
-       BCH_WRITE_DONE                  = (1 << 8),
-       BCH_WRITE_LOOPED                = (1 << 9),
+       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 8),
+       BCH_WRITE_DONE                  = (1 << 9),
+       BCH_WRITE_LOOPED                = (1 << 10),
 };
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
@@ -42,6 +45,12 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
                ? op->journal_seq_p : &op->journal_seq;
 }
 
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+       op->journal_seq_p = journal_seq;
+       op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+}
+
 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
        return op->alloc_reserve == RESERVE_MOVINGGC
@@ -51,14 +60,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 
 int bch2_write_index_default(struct bch_write_op *);
 
-static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
 {
        op->c                   = c;
        op->io_wq               = index_update_wq(op);
        op->flags               = 0;
        op->written             = 0;
        op->error               = 0;
-       op->csum_type           = bch2_data_checksum_type(c);
+       op->csum_type           = bch2_data_checksum_type(c, c->opts.data_checksum);
        op->compression_type    =
                bch2_compression_opt_to_type(c->opts.compression);
        op->nr_replicas         = 0;
@@ -75,27 +84,6 @@ static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *
        op->index_update_fn     = bch2_write_index_default;
 }
 
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                                     struct disk_reservation res,
-                                     struct bch_devs_mask *devs,
-                                     struct write_point_specifier write_point,
-                                     struct bpos pos,
-                                     u64 *journal_seq, unsigned flags)
-{
-       __bch2_write_op_init(op, c);
-       op->flags       = flags;
-       op->nr_replicas = res.nr_replicas;
-       op->pos         = pos;
-       op->res         = res;
-       op->devs        = devs;
-       op->write_point = write_point;
-
-       if (journal_seq) {
-               op->journal_seq_p = journal_seq;
-               op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
-       }
-}
-
 void bch2_write(struct closure *);
 
 static inline struct bch_write_bio *wbio_init(struct bio *bio)
@@ -134,25 +122,27 @@ static inline void bch2_read_extent(struct bch_fs *c,
                                    struct extent_pick_ptr *pick,
                                    unsigned flags)
 {
-       rbio->_state = 0;
        __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
 }
 
 static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
                             u64 inode)
 {
-       rbio->_state = 0;
+       BUG_ON(rbio->_state);
        __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
                    BCH_READ_RETRY_IF_STALE|
                    BCH_READ_MAY_PROMOTE|
                    BCH_READ_USER_MAPPED);
 }
 
-static inline struct bch_read_bio *rbio_init(struct bio *bio)
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+                                            struct bch_io_opts opts)
 {
        struct bch_read_bio *rbio = to_rbio(bio);
 
-       rbio->_state = 0;
+       rbio->_state    = 0;
+       rbio->promote   = NULL;
+       rbio->opts      = opts;
        return rbio;
 }
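
With the big bch2_write_op_init() variant gone, callers initialize the op with defaults and then set only the fields they need; the journal-sequence plumbing moved into op_journal_seq_set(). A sketch of the new call sequence (op, c, res, pos, and journal_seq are hypothetical surrounding variables):

    bch2_write_op_init(&op, c);     /* csum/compression from c->opts */
    op.nr_replicas  = res.nr_replicas;
    op.res          = res;
    op.pos          = pos;
    /* have completion record its seq into the caller's variable: */
    op_journal_seq_set(&op, &journal_seq);
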
 
index ed9a4bbe39297c13b1770f4672d0878c138316c0..ff18fdc90eb7822c7d5765e84d47fd9b3fa6f416 100644 (file)
@@ -6,6 +6,7 @@
 #include "buckets_types.h"
 #include "extents_types.h"
 #include "keylist_types.h"
+#include "opts.h"
 #include "super_types.h"
 
 #include <linux/llist.h>
@@ -56,6 +57,8 @@ struct bch_read_bio {
 
        struct promote_op       *promote;
 
+       struct bch_io_opts      opts;
+
        struct work_struct      work;
 
        struct bio              bio;
@@ -69,8 +72,7 @@ struct bch_write_bio {
        struct closure          *cl;
        };
 
-       u8                      ptr_idx;
-       u8                      replicas_failed;
+       struct bch_devs_list    failed;
        u8                      order;
 
        unsigned                split:1,
@@ -90,8 +92,8 @@ struct bch_write_op {
        struct bch_fs           *c;
        struct workqueue_struct *io_wq;
 
+       unsigned                written; /* sectors */
        u16                     flags;
-       u16                     written; /* sectors */
        s8                      error;
 
        unsigned                csum_type:4;
index 5d9a298d6f8f4a3a2c42d09154ac0835a9b69099..b4e149ac9a51d6e604c0c9bf1062a423a666c7c3 100644 (file)
@@ -338,8 +338,8 @@ struct journal_list {
  * Given a journal entry we just read, add it to the list of journal entries to
  * be replayed:
  */
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
-                   struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+                            struct journal_list *jlist, struct jset *j)
 {
        struct journal_replay *i, *pos;
        struct list_head *where;
@@ -347,8 +347,6 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
        __le64 last_seq;
        int ret;
 
-       mutex_lock(&jlist->lock);
-
        last_seq = !list_empty(jlist->head)
                ? list_last_entry(jlist->head, struct journal_replay,
                                  list)->j.last_seq
@@ -376,9 +374,7 @@ static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
                                    memcmp(j, &i->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
-
-                       ret = JOURNAL_ENTRY_ADD_OK;
-                       goto out;
+                       goto found;
                }
 
                if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
@@ -395,12 +391,16 @@ add:
                goto out;
        }
 
-       memcpy(&i->j, j, bytes);
        list_add(&i->list, where);
+       i->devs.nr = 0;
+       memcpy(&i->j, j, bytes);
+found:
+       if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+                        c, "duplicate journal entries on same device"))
+               bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
        ret = JOURNAL_ENTRY_ADD_OK;
 out:
 fsck_err:
-       mutex_unlock(&jlist->lock);
        return ret;
 }
 
@@ -496,8 +496,8 @@ fsck_err:
 #define journal_entry_err_on(cond, c, msg, ...)                                \
        ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
 
-static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
-                                   int write)
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+                                         int write)
 {
        struct jset_entry *entry;
        int ret = 0;
@@ -508,7 +508,7 @@ static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
                if (journal_entry_err_on(vstruct_next(entry) >
                                         vstruct_last(j), c,
                                "journal entry extends past end of jset")) {
-                       j->u64s = cpu_to_le64((u64 *) entry - j->_data);
+                       j->u64s = cpu_to_le32((u64 *) entry - j->_data);
                        break;
                }
 
@@ -614,7 +614,7 @@ static int journal_entry_validate(struct bch_fs *c,
                        "invalid journal entry: last_seq > seq"))
                j->last_seq = j->seq;
 
-       return __journal_entry_validate(c, j, write);
+       return 0;
 fsck_err:
        return ret;
 }
@@ -722,7 +722,10 @@ reread:                    sectors_read = min_t(unsigned,
 
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
-               ret = journal_entry_add(c, jlist, j);
+               mutex_lock(&jlist->lock);
+               ret = journal_entry_add(c, ca, jlist, j);
+               mutex_unlock(&jlist->lock);
+
                switch (ret) {
                case JOURNAL_ENTRY_ADD_OK:
                        *entries_found = true;
@@ -916,7 +919,9 @@ static int journal_seq_blacklist_read(struct journal *j,
 
        for_each_jset_entry_type(entry, &i->j,
                        JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
-               seq = le64_to_cpu(entry->_data[0]);
+               struct jset_entry_blacklist *bl_entry =
+                       container_of(entry, struct jset_entry_blacklist, entry);
+               seq = le64_to_cpu(bl_entry->seq);
 
                bch_verbose(c, "blacklisting existing journal seq %llu", seq);
 
@@ -982,6 +987,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        fsck_err_on(c->sb.clean && journal_has_keys(list), c,
                    "filesystem marked clean but journal has keys to replay");
 
+       list_for_each_entry(i, list, list) {
+               ret = journal_entry_validate_entries(c, &i->j, READ);
+               if (ret)
+                       goto fsck_err;
+       }
+
        i = list_last_entry(list, struct journal_replay, list);
 
        unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
@@ -1002,6 +1013,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                INIT_LIST_HEAD(&p->list);
                INIT_LIST_HEAD(&p->flushed);
                atomic_set(&p->count, 0);
+               p->devs.nr = 0;
        }
 
        mutex_lock(&j->blacklist_lock);
@@ -1010,6 +1022,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
 
                atomic_set(&p->count, 1);
+               p->devs = i->devs;
 
                if (journal_seq_blacklist_read(j, i, p)) {
                        mutex_unlock(&j->blacklist_lock);
@@ -1090,7 +1103,7 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
 {
        struct journal_buf *w = journal_prev_buf(j);
 
-       atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
+       atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
 
        if (!need_write_just_set &&
            test_bit(JOURNAL_NEED_WRITE, &j->flags))
@@ -1122,6 +1135,7 @@ static void __journal_entry_new(struct journal *j, int count)
        INIT_LIST_HEAD(&p->list);
        INIT_LIST_HEAD(&p->flushed);
        atomic_set(&p->count, count);
+       p->devs.nr = 0;
 }
 
 static void __bch2_journal_next_entry(struct journal *j)
@@ -1851,6 +1865,21 @@ void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
                   bch2_journal_error(j));
 }
 
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool flush;
+
+       bch2_journal_flush_pins(j, U64_MAX);
+
+       spin_lock(&j->lock);
+       flush = last_seq(j) != j->last_seq_ondisk ||
+               c->btree_roots_dirty;
+       spin_unlock(&j->lock);
+
+       return flush ? bch2_journal_meta(j) : 0;
+}
+
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
        bool ret;
@@ -2002,7 +2031,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
         * i.e. whichever device was limiting the current journal entry size.
         */
        extent_for_each_ptr_backwards(e, ptr) {
-               ca = c->devs[ptr->dev];
+               ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (ca->mi.state != BCH_MEMBER_STATE_RW ||
                    ca->journal.sectors_free <= sectors)
@@ -2197,7 +2226,7 @@ static void journal_write_endio(struct bio *bio)
        struct bch_dev *ca = bio->bi_private;
        struct journal *j = &ca->fs->journal;
 
-       if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
            bch2_meta_write_fault("journal")) {
                /* Was this a flush or an actual journal write? */
                if (ca->journal.ptr_idx != U8_MAX) {
@@ -2233,6 +2262,7 @@ static void journal_write(struct closure *cl)
                if (r->alive)
                        bch2_journal_add_btree_root(w, i, &r->key, r->level);
        }
+       c->btree_roots_dirty = false;
        mutex_unlock(&c->btree_root_lock);
 
        journal_write_compact(jset);
@@ -2246,7 +2276,7 @@ static void journal_write(struct closure *cl)
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
-           __journal_entry_validate(c, jset, WRITE))
+           journal_entry_validate_entries(c, jset, WRITE))
                goto err;
 
        bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
@@ -2257,7 +2287,7 @@ static void journal_write(struct closure *cl)
                                  journal_nonce(jset), jset);
 
        if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
-           __journal_entry_validate(c, jset, WRITE))
+           journal_entry_validate_entries(c, jset, WRITE))
                goto err;
 
        sectors = vstruct_sectors(jset, c->block_bits);
@@ -2277,6 +2307,9 @@ static void journal_write(struct closure *cl)
                                  BCH_DATA_JOURNAL))
                goto err;
 
+       journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+                       bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
        /*
         * XXX: we really should just disable the entire journal in nochanges
         * mode
@@ -2285,7 +2318,7 @@ static void journal_write(struct closure *cl)
                goto no_io;
 
        extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
-               ca = c->devs[ptr->dev];
+               ca = bch_dev_bkey_exists(c, ptr->dev);
                if (!percpu_ref_tryget(&ca->io_ref)) {
                        /* XXX: fix this */
                        bch_err(c, "missing device for journal write\n");
@@ -2693,6 +2726,46 @@ int bch2_journal_flush(struct journal *j)
        return bch2_journal_flush_seq(j, seq);
 }
 
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_entry_pin_list *p;
+       struct bch_devs_list devs;
+       u64 seq = 0;
+       unsigned iter;
+       int ret = 0;
+
+       spin_lock(&j->lock);
+       fifo_for_each_entry_ptr(p, &j->pin, iter)
+               if (bch2_dev_list_has_dev(p->devs, dev_idx))
+                       seq = journal_pin_seq(j, p);
+       spin_unlock(&j->lock);
+
+       bch2_journal_flush_pins(j, seq);
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+       seq = 0;
+
+       spin_lock(&j->lock);
+       while (!ret && seq < atomic64_read(&j->seq)) {
+               seq = max(seq, last_seq(j));
+               devs = journal_seq_pin(j, seq)->devs;
+               seq++;
+
+               spin_unlock(&j->lock);
+               ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+               spin_lock(&j->lock);
+       }
+       spin_unlock(&j->lock);
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
 ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -2862,9 +2935,7 @@ void bch2_fs_journal_stop(struct journal *j)
         * journal entries, then force a brand new empty journal entry to be
         * written:
         */
-       bch2_journal_flush_pins(j, U64_MAX);
-       bch2_journal_flush_async(j, NULL);
-       bch2_journal_meta(j);
+       bch2_journal_flush_all_pins(j);
 
        cancel_delayed_work_sync(&j->write_work);
        cancel_delayed_work_sync(&j->reclaim_work);
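
bch2_journal_flush_all_pins() folds the old flush-pins/flush-async/meta sequence into one helper that only issues a final empty journal write if something is still dirty. The new bch2_journal_flush_device() additionally rebuilds the journal replicas entries under replicas_gc_lock; a hypothetical device-removal caller might look like:

    /* Sketch: ensure no journal entry still lives only on `ca`. */
    static int evacuate_journal_sketch(struct bch_fs *c, struct bch_dev *ca)
    {
            return bch2_journal_flush_device(&c->journal, ca->dev_idx);
    }
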
index 9d6c79c6c1134fb0e071590f45488e97b4dc614b..5f3ece0899374b9d0dc520c13bface13b80b05bf 100644 (file)
  */
 struct journal_replay {
        struct list_head        list;
+       struct bch_devs_list    devs;
+       /* must be last: */
        struct jset             j;
 };
 
@@ -164,6 +166,7 @@ void bch2_journal_pin_add_if_older(struct journal *,
                                  struct journal_entry_pin *,
                                  journal_pin_flush_fn);
 void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
 
 struct closure;
 struct bch_fs;
@@ -356,6 +359,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
 
 void bch2_journal_halt(struct journal *);
 
index 55b41c56a3f2f276bf474580dbfad43e3eb71bc2..87f378a6ac4fff2358e603931e6110272c78d4f7 100644 (file)
@@ -34,6 +34,7 @@ struct journal_entry_pin_list {
        struct list_head                list;
        struct list_head                flushed;
        atomic_t                        count;
+       struct bch_devs_list            devs;
 };
 
 struct journal;
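
journal_replay and journal_entry_pin_list now carry a bch_devs_list recording which devices hold each journal sequence; this is what bch2_journal_flush_device() walks. The helpers used in journal.c behave roughly like this sketch (hypothetical layout and capacity; the real struct lives elsewhere):

    struct devs_list_sketch {
            unsigned        nr;
            u8              devs[8];
    };

    static bool has_dev_sketch(const struct devs_list_sketch *l, unsigned dev)
    {
            unsigned i;

            for (i = 0; i < l->nr; i++)
                    if (l->devs[i] == dev)
                            return true;
            return false;
    }

    static void add_dev_sketch(struct devs_list_sketch *l, unsigned dev)
    {
            if (!has_dev_sketch(l, dev))
                    l->devs[l->nr++] = dev; /* caller ensures capacity */
    }
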
index 8d1c0ee07c247c2654e56b4732cd592a0d097d83..e11ee953248389a3cb66d60da797e0b031f1f731 100644 (file)
@@ -27,23 +27,9 @@ static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
 
 #define MAX_DATA_OFF_ITER      10
 
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented.  The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+                                   int flags)
 {
-       struct bch_fs *c = ca->fs;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 keys_moved, sectors_moved;
@@ -113,10 +99,6 @@ int bch2_move_data_off_device(struct bch_dev *ca)
        return ret;
 }
 
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
 static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
                               enum btree_id id)
 {
@@ -200,9 +182,9 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
  *   is written.
  */
 
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+                                    int flags)
 {
-       struct bch_fs *c = ca->fs;
        unsigned i;
        int ret = 0;
 
@@ -240,37 +222,31 @@ err:
        return ret;
 }
 
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+       return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+               bch2_dev_metadata_migrate(c, ca, flags);
+}
 
-static int bch2_flag_key_bad(struct btree_iter *iter,
-                           struct bch_dev *ca,
-                           struct bkey_s_c_extent orig)
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+                        unsigned dev_idx, int flags, bool metadata)
 {
-       BKEY_PADDED(key) tmp;
-       struct bkey_s_extent e;
        struct bch_extent_ptr *ptr;
-       struct bch_fs *c = ca->fs;
-
-       bkey_reassemble(&tmp.key, orig.s_c);
-       e = bkey_i_to_s_extent(&tmp.key);
+       unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+       unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+       unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+       unsigned nr_good;
 
        extent_for_each_ptr_backwards(e, ptr)
-               if (ptr->dev == ca->dev_idx)
+               if (ptr->dev == dev_idx)
                        bch2_extent_drop_ptr(e, ptr);
 
-       /*
-        * If the new extent no longer has any pointers, bch2_extent_normalize()
-        * will do the appropriate thing with it (turning it into a
-        * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
-        */
-       bch2_extent_normalize(c, e.s);
+       nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+       if ((!nr_good && !(flags & lost)) ||
+           (nr_good < replicas && !(flags & degraded)))
+               return -EINVAL;
 
-       return bch2_btree_insert_at(c, NULL, NULL, NULL,
-                                  BTREE_INSERT_ATOMIC,
-                                  BTREE_INSERT_ENTRY(iter, &tmp.key));
+       return 0;
 }
 
 /*
@@ -284,11 +260,11 @@ static int bch2_flag_key_bad(struct btree_iter *iter,
  * that we've already tried to move the data MAX_DATA_OFF_ITER times and
  * are not likely to succeed if we try again.
  */
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       struct bch_fs *c = ca->fs;
        struct bkey_s_c k;
-       struct bkey_s_c_extent e;
+       struct bkey_s_extent e;
+       BKEY_PADDED(key) tmp;
        struct btree_iter iter;
        int ret = 0;
 
@@ -303,11 +279,33 @@ int bch2_flag_data_bad(struct bch_dev *ca)
                if (!bkey_extent_is_data(k.k))
                        goto advance;
 
-               e = bkey_s_c_to_extent(k);
-               if (!bch2_extent_has_device(e, ca->dev_idx))
+               if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
                        goto advance;
 
-               ret = bch2_flag_key_bad(&iter, ca, e);
+               bkey_reassemble(&tmp.key, k);
+               e = bkey_i_to_s_extent(&tmp.key);
+
+               ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+               if (ret)
+                       break;
+
+               /*
+                * If the new extent no longer has any pointers, bch2_extent_normalize()
+                * will do the appropriate thing with it (turning it into a
+                * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+                */
+               bch2_extent_normalize(c, e.s);
+
+               if (bkey_extent_is_data(e.k) &&
+                   (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+                       break;
+
+               iter.pos = bkey_start_pos(&tmp.key.k);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+                                          BTREE_INSERT_ATOMIC|
+                                          BTREE_INSERT_NOFAIL,
+                                          BTREE_INSERT_ENTRY(&iter, &tmp.key));
 
                /*
                 * don't want to leave ret == -EINTR, since if we raced and
@@ -319,26 +317,6 @@ int bch2_flag_data_bad(struct bch_dev *ca)
                if (ret)
                        break;
 
-               /*
-                * If the replica we're dropping was dirty and there is an
-                * additional cached replica, the cached replica will now be
-                * considered dirty - upon inserting the new version of the key,
-                * the bucket accounting will be updated to reflect the fact
-                * that the cached data is now dirty and everything works out as
-                * if by magic without us having to do anything.
-                *
-                * The one thing we need to be concerned with here is there's a
-                * race between when we drop any stale pointers from the key
-                * we're about to insert, and when the key actually gets
-                * inserted and the cached data is marked as dirty - we could
-                * end up trying to insert a key with a pointer that should be
-                * dirty, but points to stale data.
-                *
-                * If that happens the insert code just bails out and doesn't do
-                * the insert - however, it doesn't return an error. Hence we
-                * need to always recheck the current key before advancing to
-                * the next:
-                */
                continue;
 advance:
                if (bkey_extent_is_data(k.k)) {
@@ -357,3 +335,80 @@ advance:
 
        return ret;
 }
+
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       struct btree_iter iter;
+       struct closure cl;
+       struct btree *b;
+       unsigned id;
+       int ret;
+
+       /* don't handle this yet: */
+       if (flags & BCH_FORCE_IF_METADATA_LOST)
+               return -EINVAL;
+
+       closure_init_stack(&cl);
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+                       struct bkey_i_extent *new_key;
+retry:
+                       if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+                                                   dev_idx)) {
+                               bch2_btree_iter_set_locks_want(&iter, 0);
+
+                               ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+                                                           BCH_DATA_BTREE);
+                               if (ret)
+                                       goto err;
+                       } else {
+                               bkey_copy(&tmp.k, &b->key);
+                               new_key = bkey_i_to_extent(&tmp.k);
+
+                               ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+                                                   dev_idx, flags, true);
+                               if (ret)
+                                       goto err;
+
+                               if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+                                       b = bch2_btree_iter_peek_node(&iter);
+                                       goto retry;
+                               }
+
+                               ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+                               if (ret == -EINTR) {
+                                       b = bch2_btree_iter_peek_node(&iter);
+                                       goto retry;
+                               }
+                               if (ret)
+                                       goto err;
+                       }
+               }
+               bch2_btree_iter_unlock(&iter);
+
+               /* btree root: lock/unlock serializes against an in-flight root update */
+               mutex_lock(&c->btree_root_lock);
+               mutex_unlock(&c->btree_root_lock);
+       }
+
+       ret = 0;
+out:
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+err:
+       bch2_btree_iter_unlock(&iter);
+       goto out;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+       return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+               bch2_dev_metadata_drop(c, dev_idx, flags);
+}
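
A minimal standalone sketch of the durability gate that drop_dev_ptrs() above
implements: once the device's pointer has been removed, the drop is refused
unless the caller passed the matching BCH_FORCE_IF_* flag. The helper name and
flag values below are illustrative stand-ins, not from this commit:

	#include <errno.h>

	#define FORCE_IF_LOST		(1 << 0)	/* stand-in for BCH_FORCE_IF_*_LOST */
	#define FORCE_IF_DEGRADED	(1 << 1)	/* stand-in for BCH_FORCE_IF_*_DEGRADED */

	/* nr_good: intact replicas left after the drop; want: replicas wanted */
	static int drop_allowed(unsigned nr_good, unsigned want, int flags)
	{
		if (!nr_good && !(flags & FORCE_IF_LOST))
			return -EINVAL;		/* drop would lose the data outright */
		if (nr_good < want && !(flags & FORCE_IF_DEGRADED))
			return -EINVAL;		/* drop would leave it under-replicated */
		return 0;
	}
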
index 9bdaa79290a1582ac47d82832c5aca638f193454..6db7b9111bf27806474920cba1cabb11ae37bda8 100644 (file)
@@ -1,8 +1,7 @@
 #ifndef _BCACHEFS_MIGRATE_H
 #define _BCACHEFS_MIGRATE_H
 
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
 
 #endif /* _BCACHEFS_MIGRATE_H */
index 5eaf0cf857ef27c3c0feabfe4c8c6a3d2a0f972e..8ce63d665e8ab53deecbfeb9699fddc8d89b1a13 100644 (file)
@@ -3,6 +3,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "inode.h"
 #include "io.h"
 #include "move.h"
 #include "super-io.h"
@@ -206,7 +207,7 @@ static void move_write(struct closure *cl)
 {
        struct moving_io *io = container_of(cl, struct moving_io, cl);
 
-       if (likely(!io->rbio.bio.bi_error)) {
+       if (likely(!io->rbio.bio.bi_status)) {
                bch2_migrate_write_init(&io->write, &io->rbio);
                closure_call(&io->write.op.cl, bch2_write, NULL, cl);
        }
@@ -240,6 +241,7 @@ static int bch2_move_extent(struct bch_fs *c,
                          struct write_point_specifier wp,
                          int btree_insert_flags,
                          int move_device,
+                         struct bch_io_opts opts,
                          struct bkey_s_c k)
 {
        struct extent_pick_ptr pick;
@@ -276,6 +278,7 @@ static int bch2_move_extent(struct bch_fs *c,
                goto err;
        }
 
+       io->rbio.opts = opts;
        bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -284,9 +287,13 @@ static int bch2_move_extent(struct bch_fs *c,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       __bch2_write_op_init(&io->write.op, c);
        io->write.btree_insert_flags = btree_insert_flags;
        io->write.move_dev      = move_device;
+
+       bch2_write_op_init(&io->write.op, c);
+       io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+       io->write.op.compression_type =
+               bch2_compression_opt_to_type(opts.compression);
        io->write.op.devs       = devs;
        io->write.op.write_point = wp;
 
@@ -371,9 +378,11 @@ int bch2_move_data(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct moving_context ctxt;
+       struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        BKEY_PADDED(k) tmp;
        struct bkey_s_c k;
+       u64 cur_inum = U64_MAX;
        int ret = 0;
 
        bch2_move_ctxt_init(&ctxt);
@@ -396,7 +405,7 @@ int bch2_move_data(struct bch_fs *c,
                    (bch2_btree_iter_unlock(&iter),
                     (ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
                        break;
-
+peek:
                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;
@@ -404,8 +413,23 @@ int bch2_move_data(struct bch_fs *c,
                if (ret)
                        break;
 
-               if (!bkey_extent_is_data(k.k) ||
-                   !pred(arg, bkey_s_c_to_extent(k)))
+               if (!bkey_extent_is_data(k.k))
+                       goto next;
+
+               if (cur_inum != k.k->p.inode) {
+                       struct bch_inode_unpacked inode;
+
+                       /* don't hold btree locks while looking up inode: */
+                       bch2_btree_iter_unlock(&iter);
+
+                       opts = bch2_opts_to_inode_opts(c->opts);
+                       if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+                               bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+                       cur_inum = k.k->p.inode;
+                       goto peek;
+               }
+
+               if (!pred(arg, bkey_s_c_to_extent(k)))
                        goto next;
 
                /* unlock before doing IO: */
@@ -415,7 +439,7 @@ int bch2_move_data(struct bch_fs *c,
 
                if (bch2_move_extent(c, &ctxt, devs, wp,
                                     btree_insert_flags,
-                                    move_device, k)) {
+                                    move_device, opts, k)) {
                        /* memory allocation failure, wait for some IO to finish */
                        bch2_move_ctxt_wait_for_io(&ctxt);
                        continue;
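
bch2_move_data() now resolves per-inode IO options: when the iterator crosses
into a new inode it drops btree locks, looks that inode up once, and re-peeks.
A toy model of the cache-and-re-peek pattern (all names here are hypothetical;
the real code uses bch2_inode_find_by_inum() and bch2_io_opts_apply()):

	#include <stdint.h>
	#include <stdio.h>

	struct opts { int compression; };

	static struct opts inode_opts(uint64_t inum)
	{
		return (struct opts) { .compression = (int) (inum & 1) };
	}

	int main(void)
	{
		uint64_t key_inodes[] = { 1, 1, 2, 2, 7 };	/* inode of each extent, in key order */
		uint64_t cur_inum = UINT64_MAX;
		struct opts opts = { 0 };
		unsigned i;

		for (i = 0; i < 5; i++) {
			if (key_inodes[i] != cur_inum) {
				/* the real code unlocks the btree iterator here, then re-peeks */
				opts = inode_opts(key_inodes[i]);
				cur_inum = key_inodes[i];
			}
			printf("move extent: inode %llu compression %d\n",
			       (unsigned long long) key_inodes[i], opts.compression);
		}
		return 0;
	}
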
index c9482151d42543a0e8b064f71b567505edee5209..28e40e41b08e90b22aadd4e0285198d43b8f7f0c 100644 (file)
@@ -76,16 +76,27 @@ void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
 #undef BCH_OPT
 }
 
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
 {
        switch (id) {
 #define BCH_OPT(_name, ...)                                            \
        case Opt_##_name:                                               \
-               return opts->_name;                                     \
-
+               return opt_defined(*opts, _name);
        BCH_OPTS()
 #undef BCH_OPT
+       default:
+               BUG();
+       }
+}
 
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+       switch (id) {
+#define BCH_OPT(_name, ...)                                            \
+       case Opt_##_name:                                               \
+               return opts->_name;
+       BCH_OPTS()
+#undef BCH_OPT
        default:
                BUG();
        }
@@ -98,10 +109,8 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
        case Opt_##_name:                                               \
                opt_set(*opts, _name, v);                               \
                break;
-
        BCH_OPTS()
 #undef BCH_OPT
-
        default:
                BUG();
        }
@@ -118,7 +127,6 @@ struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
 #define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)         \
        if (_sb_opt != NO_SB_OPT)                                       \
                opt_set(opts, _name, _sb_opt(sb));
-
        BCH_OPTS()
 #undef BCH_OPT
 
@@ -145,7 +153,7 @@ const struct bch_option bch2_opt_table[] = {
 #undef BCH_OPT
 };
 
-static int bch2_opt_lookup(const char *name)
+int bch2_opt_lookup(const char *name)
 {
        const struct bch_option *i;
 
@@ -247,3 +255,52 @@ no_val:
        pr_err("Mount option %s requires a value", name);
        return -1;
 }
+
+/* io opts: */
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+       struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(ret, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+       struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(ret, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       return ret;
+}
+
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits)                                    \
+       if (opt_defined(src, _name))                                    \
+               opt_set(*dst, _name, src._name);
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+       static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits)    Opt_##_name,
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+       };
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+               if (inode_opt_list[i] == id)
+                       return true;
+
+       return false;
+}
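
The three helpers above are generated by re-#defining BCH_INODE_OPT around
BCH_INODE_OPTS(). Assuming opt_defined()/opt_set() read and set the
_name##_defined bitfields declared in opts.h, bch2_io_opts_apply() is
equivalent to this hand-expanded sketch:

	static void io_opts_apply_expanded(struct bch_io_opts *dst,
					   struct bch_io_opts src)
	{
		if (src.data_checksum_defined) {
			dst->data_checksum = src.data_checksum;
			dst->data_checksum_defined = 1;
		}
		if (src.compression_defined) {
			dst->compression = src.compression;
			dst->compression_defined = 1;
		}
	}
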
index 33e3a2c89f1b22d7ecb0f3881112c1723f4540b5..126056e605435e65c1d5ad29528a76eae5cca6f2 100644 (file)
@@ -181,10 +181,7 @@ do {                                                                       \
 
 static inline struct bch_opts bch2_opts_empty(void)
 {
-       struct bch_opts opts;
-
-       memset(&opts, 0, sizeof(opts));
-       return opts;
+       return (struct bch_opts) { 0 };
 }
 
 void bch2_opts_apply(struct bch_opts *, struct bch_opts);
@@ -215,12 +212,35 @@ struct bch_option {
 
 extern const struct bch_option bch2_opt_table[];
 
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
 u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
 void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
 
 struct bch_opts bch2_opts_from_sb(struct bch_sb *);
 
+int bch2_opt_lookup(const char *);
 int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
 int bch2_parse_mount_opts(struct bch_opts *, char *);
 
+/* inode opts: */
+
+#define BCH_INODE_OPTS()                                       \
+       BCH_INODE_OPT(data_checksum,                    8)      \
+       BCH_INODE_OPT(compression,                      8)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits)    unsigned _name##_defined:1;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits)    u##_bits _name;
+       BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
 #endif /* _BCACHEFS_OPTS_H */
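
For reference, with the two options currently in BCH_INODE_OPTS() the struct
above expands to:

	struct bch_io_opts {
		unsigned data_checksum_defined:1;
		unsigned compression_defined:1;

		u8 data_checksum;
		u8 compression;
	};
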
index a3ecfb92c38cf03e996a938b199ab23bd71860ed..3f55c244c6392ba05dfa0af091902579bd673a57 100644 (file)
@@ -12,6 +12,8 @@
 #include <linux/sort.h>
 
 static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+                                           struct bch_replicas_cpu *);
 static const char *bch2_sb_validate_replicas(struct bch_sb *);
 
 static inline void __bch2_sb_layout_size_assert(void)
@@ -157,7 +159,7 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                return NULL;
 
        f = __bch2_sb_field_resize(sb->sb, f, u64s);
-       f->type = type;
+       f->type = cpu_to_le32(type);
        return f;
 }
 
@@ -188,7 +190,7 @@ struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
        }
 
        f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
-       f->type = type;
+       f->type = cpu_to_le32(type);
        return f;
 }
 
@@ -354,7 +356,16 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 
        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
            BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
-               return "Invalid number of metadata replicas";
+               return "Invalid number of data replicas";
+
+       if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+               return "Invalid metadata checksum type";
+
+       if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+               return "Invalid metadata checksum type";
+
+       if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+               return "Invalid compression type";
 
        if (!BCH_SB_BTREE_NODE_SIZE(sb))
                return "Btree node size not set";
@@ -507,7 +518,7 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
                if (src_f->type == BCH_SB_FIELD_journal)
                        continue;
 
-               dst_f = bch2_sb_field_get(dst, src_f->type);
+               dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
                dst_f = __bch2_sb_field_resize(dst, dst_f,
                                le32_to_cpu(src_f->u64s));
 
@@ -601,7 +612,7 @@ reread:
 
        /* XXX: verify MACs */
        csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
-                           (struct nonce) { 0 }, sb->sb);
+                           null_nonce(), sb->sb);
 
        if (bch2_crc_cmp(csum, sb->sb->csum))
                return "bad checksum reading superblock";
@@ -688,9 +699,9 @@ const char *bch2_read_super(const char *path,
 got_super:
        pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
                 le64_to_cpu(ret->sb->version),
-                le64_to_cpu(ret->sb->flags),
+                le64_to_cpu(ret->sb->flags[0]),
                 le64_to_cpu(ret->sb->seq),
-                le16_to_cpu(ret->sb->u64s));
+                le32_to_cpu(ret->sb->u64s));
 
        err = "Superblock block size smaller than device block size";
        if (le16_to_cpu(ret->sb->block_size) << 9 <
@@ -711,7 +722,7 @@ static void write_super_endio(struct bio *bio)
 
        /* XXX: return errors directly */
 
-       if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
                ca->sb_write_error = 1;
 
        closure_put(&ca->fs->sb_write);
@@ -727,7 +738,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 
        SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
        sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
-                               (struct nonce) { 0 }, sb);
+                               null_nonce(), sb);
 
        bio_reset(bio);
        bio->bi_bdev            = ca->disk_sb.bdev;
@@ -830,7 +841,12 @@ out:
        bch2_sb_update(c);
 }
 
-/* replica information: */
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i)                            \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+            _i = (void *) (_i) + (_r)->entry_size)
 
 static inline struct bch_replicas_cpu_entry *
 cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
@@ -838,6 +854,11 @@ cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
        return (void *) r->entries + r->entry_size * i;
 }
 
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+       eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
 static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
                                     unsigned dev)
 {
@@ -856,6 +877,246 @@ static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
                offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
 }
 
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+                            enum bch_data_type data_type,
+                            struct bch_replicas_cpu_entry *r,
+                            unsigned *max_dev)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned nr = 0;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached) {
+                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+                       replicas_set_dev(r, ptr->dev);
+                       nr++;
+               }
+       return nr;
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+                      struct bch_replicas_cpu_entry new_entry,
+                      unsigned max_dev)
+{
+       struct bch_replicas_cpu *new;
+       unsigned i, nr, entry_size;
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+       entry_size = max(entry_size, old->entry_size);
+       nr = old->nr + 1;
+
+       new = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     nr * entry_size, GFP_NOIO);
+       if (!new)
+               return NULL;
+
+       new->nr         = nr;
+       new->entry_size = entry_size;
+
+       for (i = 0; i < old->nr; i++)
+               memcpy(cpu_replicas_entry(new, i),
+                      cpu_replicas_entry(old, i),
+                      min(new->entry_size, old->entry_size));
+
+       memcpy(cpu_replicas_entry(new, old->nr),
+              &new_entry,
+              new->entry_size);
+
+       bch2_cpu_replicas_sort(new);
+       return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+                               struct bch_replicas_cpu_entry search,
+                               unsigned max_dev)
+{
+       return max_dev < replicas_dev_slots(r) &&
+               eytzinger0_find(r->entries, r->nr,
+                               r->entry_size,
+                               memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+                               struct bch_replicas_cpu_entry new_entry,
+                               unsigned max_dev)
+{
+       struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
+       int ret = -ENOMEM;
+
+       mutex_lock(&c->sb_lock);
+
+       old_gc = rcu_dereference_protected(c->replicas_gc,
+                                          lockdep_is_held(&c->sb_lock));
+       if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+               new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+               if (!new_gc)
+                       goto err;
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+       /* recheck, might have raced */
+       if (replicas_has_entry(old_r, new_entry, max_dev))
+               goto out;
+
+       new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+       if (!new_r)
+               goto err;
+
+       ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+       if (ret)
+               goto err;
+
+       if (new_gc) {
+               rcu_assign_pointer(c->replicas_gc, new_gc);
+               kfree_rcu(old_gc, rcu);
+       }
+
+       rcu_assign_pointer(c->replicas, new_r);
+       kfree_rcu(old_r, rcu);
+
+       bch2_write_super(c);
+out:
+       ret = 0;
+err:
+       mutex_unlock(&c->sb_lock);
+       return ret;
+}
+
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+                               struct bch_replicas_cpu_entry search,
+                               unsigned max_dev)
+{
+       struct bch_replicas_cpu *r, *gc_r;
+       bool marked;
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+       gc_r = rcu_dereference(c->replicas_gc);
+       marked = replicas_has_entry(r, search, max_dev) &&
+               (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+       rcu_read_unlock();
+
+       return likely(marked) ? 0
+               : bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+                         enum bch_data_type data_type)
+{
+       struct bch_replicas_cpu_entry search;
+       unsigned max_dev;
+
+       if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+               return 0;
+
+       return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+                                 struct bch_devs_list *devs,
+                                 enum bch_data_type data_type)
+{
+       struct bch_replicas_cpu_entry search = { .data_type = data_type };
+       unsigned i, max_dev = 0;
+
+       if (!devs->nr)
+               return 0;
+
+       for (i = 0; i < devs->nr; i++) {
+               max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+               replicas_set_dev(&search, devs->devs[i]);
+       }
+
+       return __bch2_check_mark_super(c, search, max_dev);
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+       struct bch_replicas_cpu *new_r, *old_r;
+       int ret = 0;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+
+       new_r = rcu_dereference_protected(c->replicas_gc,
+                                         lockdep_is_held(&c->sb_lock));
+
+       if (err) {
+               rcu_assign_pointer(c->replicas_gc, NULL);
+               kfree_rcu(new_r, rcu);
+               goto err;
+       }
+
+       if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+               ret = -ENOSPC;
+               goto err;
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+
+       rcu_assign_pointer(c->replicas, new_r);
+       rcu_assign_pointer(c->replicas_gc, NULL);
+       kfree_rcu(old_r, rcu);
+
+       bch2_write_super(c);
+err:
+       mutex_unlock(&c->sb_lock);
+       return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+       struct bch_replicas_cpu *dst, *src;
+       struct bch_replicas_cpu_entry *e;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+       BUG_ON(c->replicas_gc);
+
+       src = rcu_dereference_protected(c->replicas,
+                                       lockdep_is_held(&c->sb_lock));
+
+       dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     src->nr * src->entry_size, GFP_NOIO);
+       if (!dst) {
+               mutex_unlock(&c->sb_lock);
+               return -ENOMEM;
+       }
+
+       dst->nr         = 0;
+       dst->entry_size = src->entry_size;
+
+       for_each_cpu_replicas_entry(src, e)
+               if (!((1 << e->data_type) & typemask))
+                       memcpy(cpu_replicas_entry(dst, dst->nr++),
+                              e, dst->entry_size);
+
+       bch2_cpu_replicas_sort(dst);
+
+       rcu_assign_pointer(c->replicas_gc, dst);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+/* Replicas tracking - superblock: */
+
 static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
                                        unsigned *nr,
                                        unsigned *bytes,
@@ -914,10 +1175,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
                }
        }
 
-       eytzinger0_sort(cpu_r->entries,
-                       cpu_r->nr,
-                       cpu_r->entry_size,
-                       memcmp, NULL);
+       bch2_cpu_replicas_sort(cpu_r);
        return cpu_r;
 }
 
@@ -926,14 +1184,12 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_cpu *cpu_r, *old_r;
 
-       lockdep_assert_held(&c->sb_lock);
-
        sb_r    = bch2_sb_get_replicas(c->disk_sb);
        cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
        if (!cpu_r)
                return -ENOMEM;
 
-       old_r = c->replicas;
+       old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->replicas, cpu_r);
        if (old_r)
                kfree_rcu(old_r, rcu);
@@ -941,192 +1197,133 @@ static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
        return 0;
 }
 
-static void bkey_to_replicas(struct bkey_s_c_extent e,
-                            enum bch_data_type data_type,
-                            struct bch_replicas_cpu_entry *r,
-                            unsigned *max_dev)
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+                                           struct bch_replicas_cpu *r)
 {
-       const struct bch_extent_ptr *ptr;
-
-       BUG_ON(!data_type ||
-              data_type == BCH_DATA_SB ||
-              data_type >= BCH_DATA_NR);
-
-       memset(r, 0, sizeof(*r));
-       r->data_type = data_type;
-
-       *max_dev = 0;
-
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached) {
-                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
-                       replicas_set_dev(r, ptr->dev);
-               }
-}
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_entry *sb_e;
+       struct bch_replicas_cpu_entry *e;
+       size_t i, bytes;
 
-/*
- * for when gc of replica information is in progress:
- */
-static int bch2_update_gc_replicas(struct bch_fs *c,
-                                  struct bch_replicas_cpu *gc_r,
-                                  struct bkey_s_c_extent e,
-                                  enum bch_data_type data_type)
-{
-       struct bch_replicas_cpu_entry new_e;
-       struct bch_replicas_cpu *new;
-       unsigned i, nr, entry_size, max_dev;
+       bytes = sizeof(struct bch_sb_field_replicas);
 
-       bkey_to_replicas(e, data_type, &new_e, &max_dev);
+       for_each_cpu_replicas_entry(r, e) {
+               bytes += sizeof(struct bch_replicas_entry);
+               for (i = 0; i < r->entry_size - 1; i++)
+                       bytes += hweight8(e->devs[i]);
+       }
 
-       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-               DIV_ROUND_UP(max_dev + 1, 8);
-       entry_size = max(entry_size, gc_r->entry_size);
-       nr = gc_r->nr + 1;
+       sb_r = bch2_fs_sb_resize_replicas(c,
+                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+       if (!sb_r)
+               return -ENOSPC;
 
-       new = kzalloc(sizeof(struct bch_replicas_cpu) +
-                     nr * entry_size, GFP_NOIO);
-       if (!new)
-               return -ENOMEM;
+       memset(&sb_r->entries, 0,
+              vstruct_end(&sb_r->field) -
+              (void *) &sb_r->entries);
 
-       new->nr         = nr;
-       new->entry_size = entry_size;
+       sb_e = sb_r->entries;
+       for_each_cpu_replicas_entry(r, e) {
+               sb_e->data_type = e->data_type;
 
-       for (i = 0; i < gc_r->nr; i++)
-               memcpy(cpu_replicas_entry(new, i),
-                      cpu_replicas_entry(gc_r, i),
-                      gc_r->entry_size);
+               for (i = 0; i < replicas_dev_slots(r); i++)
+                       if (replicas_test_dev(e, i))
+                               sb_e->devs[sb_e->nr++] = i;
 
-       memcpy(cpu_replicas_entry(new, nr - 1),
-              &new_e,
-              new->entry_size);
+               sb_e = replicas_entry_next(sb_e);
 
-       eytzinger0_sort(new->entries,
-                       new->nr,
-                       new->entry_size,
-                       memcmp, NULL);
+               BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+       }
 
-       rcu_assign_pointer(c->replicas_gc, new);
-       kfree_rcu(gc_r, rcu);
        return 0;
 }
 
-static bool replicas_has_extent(struct bch_replicas_cpu *r,
-                               struct bkey_s_c_extent e,
-                               enum bch_data_type data_type)
-{
-       struct bch_replicas_cpu_entry search;
-       unsigned max_dev;
-
-       bkey_to_replicas(e, data_type, &search, &max_dev);
-
-       return max_dev < replicas_dev_slots(r) &&
-               eytzinger0_find(r->entries, r->nr,
-                               r->entry_size,
-                               memcmp, &search) < r->nr;
-}
-
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
-                         enum bch_data_type data_type)
-{
-       bool ret;
-
-       rcu_read_lock();
-       ret = replicas_has_extent(rcu_dereference(c->replicas),
-                                 e, data_type);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
-                                         struct bkey_s_c_extent e,
-                                         enum bch_data_type data_type)
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
 {
-       struct bch_replicas_cpu *gc_r;
-       const struct bch_extent_ptr *ptr;
+       struct bch_sb_field_members *mi;
        struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_entry *new_entry;
-       unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
-       int ret = 0;
+       struct bch_replicas_cpu *cpu_r = NULL;
+       struct bch_replicas_entry *e;
+       const char *err;
+       unsigned i;
 
-       mutex_lock(&c->sb_lock);
+       mi      = bch2_sb_get_members(sb);
+       sb_r    = bch2_sb_get_replicas(sb);
+       if (!sb_r)
+               return NULL;
 
-       gc_r = rcu_dereference_protected(c->replicas_gc,
-                                        lockdep_is_held(&c->sb_lock));
-       if (gc_r &&
-           !replicas_has_extent(gc_r, e, data_type)) {
-               ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
-               if (ret)
+       for_each_replicas_entry(sb_r, e) {
+               err = "invalid replicas entry: invalid data type";
+               if (e->data_type >= BCH_DATA_NR)
                        goto err;
-       }
-
-       /* recheck, might have raced */
-       if (bch2_sb_has_replicas(c, e, data_type)) {
-               mutex_unlock(&c->sb_lock);
-               return 0;
-       }
 
-       new_entry_bytes = sizeof(struct bch_replicas_entry) +
-               bch2_extent_nr_dirty_ptrs(e.s_c);
-
-       sb_r = bch2_sb_get_replicas(c->disk_sb);
+               err = "invalid replicas entry: no devices";
+               if (!e->nr)
+                       goto err;
 
-       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+               err = "invalid replicas entry: too many devices";
+               if (e->nr >= BCH_REPLICAS_MAX)
+                       goto err;
 
-       new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+               err = "invalid replicas entry: invalid device";
+               for (i = 0; i < e->nr; i++)
+                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
+                               goto err;
+       }
 
-       sb_r = bch2_fs_sb_resize_replicas(c,
-                       DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
-                                    sizeof(u64)));
-       if (!sb_r) {
-               ret = -ENOSPC;
+       err = "cannot allocate memory";
+       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
                goto err;
-       }
 
-       new_entry = (void *) sb_r + bytes;
-       new_entry->data_type = data_type;
-       new_entry->nr = 0;
+       sort_cmp_size(cpu_r->entries,
+                     cpu_r->nr,
+                     cpu_r->entry_size,
+                     memcmp, NULL);
+
+       for (i = 0; i + 1 < cpu_r->nr; i++) {
+               struct bch_replicas_cpu_entry *l =
+                       cpu_replicas_entry(cpu_r, i);
+               struct bch_replicas_cpu_entry *r =
+                       cpu_replicas_entry(cpu_r, i + 1);
 
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached)
-                       new_entry->devs[new_entry->nr++] = ptr->dev;
+               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
 
-       ret = bch2_sb_replicas_to_cpu_replicas(c);
-       if (ret) {
-               memset(new_entry, 0,
-                      vstruct_end(&sb_r->field) - (void *) new_entry);
-               goto err;
+               err = "duplicate replicas entry";
+               if (!memcmp(l, r, cpu_r->entry_size))
+                       goto err;
        }
 
-       bch2_write_super(c);
+       err = NULL;
 err:
-       mutex_unlock(&c->sb_lock);
-       return ret;
+       kfree(cpu_r);
+       return err;
 }
 
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+/* Query replicas: */
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
                          enum bch_data_type data_type)
 {
-       struct bch_replicas_cpu *gc_r;
-       bool marked;
+       struct bch_replicas_cpu_entry search;
+       unsigned max_dev;
+       bool ret;
+
+       if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+               return true;
 
        rcu_read_lock();
-       marked = replicas_has_extent(rcu_dereference(c->replicas),
-                                    e, data_type) &&
-               (!(gc_r = rcu_dereference(c->replicas_gc)) ||
-                replicas_has_extent(gc_r, e, data_type));
+       ret = replicas_has_entry(rcu_dereference(c->replicas),
+                                search, max_dev);
        rcu_read_unlock();
 
-       if (marked)
-               return 0;
-
-       return bch2_check_mark_super_slowpath(c, e, data_type);
+       return ret;
 }
 
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-                                       struct bch_devs_mask online_devs)
+                                             struct bch_devs_mask online_devs)
 {
+       struct bch_sb_field_members *mi;
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
        unsigned i, dev, dev_slots, nr_online, nr_offline;
@@ -1137,14 +1334,15 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
        for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
                ret.replicas[i].nr_online = UINT_MAX;
 
+       mi = bch2_sb_get_members(c->disk_sb);
        rcu_read_lock();
-       r = rcu_dereference(c->replicas);
-       dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
 
-       for (i = 0; i < r->nr; i++) {
-               e = cpu_replicas_entry(r, i);
+       r = rcu_dereference(c->replicas);
+       dev_slots = replicas_dev_slots(r);
 
-               BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+       for_each_cpu_replicas_entry(r, e) {
+               if (e->data_type >= ARRAY_SIZE(ret.replicas))
+                       panic("e %p data_type %u\n", e, e->data_type);
 
                nr_online = nr_offline = 0;
 
@@ -1152,6 +1350,8 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                        if (!replicas_test_dev(e, dev))
                                continue;
 
+                       BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+
                        if (test_bit(dev, online_devs.d))
                                nr_online++;
                        else
@@ -1216,7 +1416,7 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bch_replicas_cpu_entry *e;
        struct bch_replicas_cpu *r;
-       unsigned i, ret = 0;
+       unsigned ret = 0;
 
        rcu_read_lock();
        r = rcu_dereference(c->replicas);
@@ -1224,191 +1424,13 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
        if (ca->dev_idx >= replicas_dev_slots(r))
                goto out;
 
-       for (i = 0; i < r->nr; i++) {
-               e = cpu_replicas_entry(r, i);
-
+       for_each_cpu_replicas_entry(r, e)
                if (replicas_test_dev(e, ca->dev_idx)) {
                        ret |= 1 << e->data_type;
                        break;
                }
-       }
 out:
        rcu_read_unlock();
 
        return ret;
 }
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
-{
-       struct bch_sb_field_members *mi;
-       struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_cpu *cpu_r = NULL;
-       struct bch_replicas_entry *e;
-       const char *err;
-       unsigned i;
-
-       mi      = bch2_sb_get_members(sb);
-       sb_r    = bch2_sb_get_replicas(sb);
-       if (!sb_r)
-               return NULL;
-
-       for_each_replicas_entry(sb_r, e) {
-               err = "invalid replicas entry: invalid data type";
-               if (e->data_type >= BCH_DATA_NR)
-                       goto err;
-
-               err = "invalid replicas entry: too many devices";
-               if (e->nr >= BCH_REPLICAS_MAX)
-                       goto err;
-
-               err = "invalid replicas entry: invalid device";
-               for (i = 0; i < e->nr; i++)
-                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
-                               goto err;
-       }
-
-       err = "cannot allocate memory";
-       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
-       if (!cpu_r)
-               goto err;
-
-       sort_cmp_size(cpu_r->entries,
-                     cpu_r->nr,
-                     cpu_r->entry_size,
-                     memcmp, NULL);
-
-       for (i = 0; i + 1 < cpu_r->nr; i++) {
-               struct bch_replicas_cpu_entry *l =
-                       cpu_replicas_entry(cpu_r, i);
-               struct bch_replicas_cpu_entry *r =
-                       cpu_replicas_entry(cpu_r, i + 1);
-
-               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
-               err = "duplicate replicas entry";
-               if (!memcmp(l, r, cpu_r->entry_size))
-                       goto err;
-       }
-
-       err = NULL;
-err:
-       kfree(cpu_r);
-       return err;
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
-       struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_cpu *r, *old_r;
-       struct bch_replicas_entry *dst_e;
-       size_t i, j, bytes, dev_slots;
-       int ret = 0;
-
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-
-       r = rcu_dereference_protected(c->replicas_gc,
-                                     lockdep_is_held(&c->sb_lock));
-
-       if (err) {
-               rcu_assign_pointer(c->replicas_gc, NULL);
-               kfree_rcu(r, rcu);
-               goto err;
-       }
-
-       dev_slots = replicas_dev_slots(r);
-
-       bytes = sizeof(struct bch_sb_field_replicas);
-
-       for (i = 0; i < r->nr; i++) {
-               struct bch_replicas_cpu_entry *e =
-                       cpu_replicas_entry(r, i);
-
-               bytes += sizeof(struct bch_replicas_entry);
-               for (j = 0; j < r->entry_size - 1; j++)
-                       bytes += hweight8(e->devs[j]);
-       }
-
-       sb_r = bch2_fs_sb_resize_replicas(c,
-                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
-       if (!sb_r) {
-               ret = -ENOSPC;
-               goto err;
-       }
-
-       memset(&sb_r->entries, 0,
-              vstruct_end(&sb_r->field) -
-              (void *) &sb_r->entries);
-
-       dst_e = sb_r->entries;
-       for (i = 0; i < r->nr; i++) {
-               struct bch_replicas_cpu_entry *src_e =
-                       cpu_replicas_entry(r, i);
-
-               dst_e->data_type = src_e->data_type;
-
-               for (j = 0; j < dev_slots; j++)
-                       if (replicas_test_dev(src_e, j))
-                               dst_e->devs[dst_e->nr++] = j;
-
-               dst_e = replicas_entry_next(dst_e);
-       }
-
-       old_r = rcu_dereference_protected(c->replicas,
-                                         lockdep_is_held(&c->sb_lock));
-       rcu_assign_pointer(c->replicas, r);
-       rcu_assign_pointer(c->replicas_gc, NULL);
-       kfree_rcu(old_r, rcu);
-
-       bch2_write_super(c);
-err:
-       mutex_unlock(&c->sb_lock);
-       return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
-       struct bch_replicas_cpu *r, *src;
-       unsigned i;
-
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-       BUG_ON(c->replicas_gc);
-
-       src = rcu_dereference_protected(c->replicas,
-                                       lockdep_is_held(&c->sb_lock));
-
-       r = kzalloc(sizeof(struct bch_replicas_cpu) +
-                   src->nr * src->entry_size, GFP_NOIO);
-       if (!r) {
-               mutex_unlock(&c->sb_lock);
-               return -ENOMEM;
-       }
-
-       r->entry_size = src->entry_size;
-       r->nr = 0;
-
-       for (i = 0; i < src->nr; i++) {
-               struct bch_replicas_cpu_entry *dst_e =
-                       cpu_replicas_entry(r, r->nr);
-               struct bch_replicas_cpu_entry *src_e =
-                       cpu_replicas_entry(src, i);
-
-               if (!(src_e->data_type & typemask)) {
-                       memcpy(dst_e, src_e, r->entry_size);
-                       r->nr++;
-               }
-       }
-
-       eytzinger0_sort(r->entries,
-                       r->nr,
-                       r->entry_size,
-                       memcmp, NULL);
-
-       rcu_assign_pointer(c->replicas_gc, r);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
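
The rewritten GC entry points are meant to be driven the way
bch2_dev_metadata_drop() in migrate.c above does: start GC for a type mask,
re-mark everything still in use (bch2_check_mark_super() now populates
c->replicas_gc as well as c->replicas), then commit or discard. The caller
pattern, as a sketch (walk_and_mark() is a hypothetical stand-in for the
caller's walk):

	int ret;

	mutex_lock(&c->replicas_gc_lock);

	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
	if (!ret) {
		ret = walk_and_mark(c);			/* marks live entries */
		ret = bch2_replicas_gc_end(c, ret);	/* swaps in filtered table on success */
	}

	mutex_unlock(&c->replicas_gc_lock);
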
index 8cafb30181fd5579738c8062f14427b34e6a519a..4096efb22d269dc55d3f5dbe95dc0aa2e46765d6 100644 (file)
@@ -125,23 +125,12 @@ void bch2_write_super(struct bch_fs *);
 
 /* replicas: */
 
-/* iterate over bch_sb_field_replicas: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
-       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i)                                        \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
-            (_i) = replicas_entry_next(_i))
-
 bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
                          enum bch_data_type);
 int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
                          enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+                                 enum bch_data_type);
 
 struct replicas_status {
        struct {
@@ -161,4 +150,17 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
 int bch2_replicas_gc_end(struct bch_fs *, int);
 int bch2_replicas_gc_start(struct bch_fs *, unsigned);
 
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i)                                        \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+            (_i) = replicas_entry_next(_i))
+
 #endif /* _BCACHEFS_SUPER_IO_H */
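
Minimal usage sketch of the relocated iterator, as a userspace tool might dump
the superblock's replicas section (error handling elided):

	struct bch_sb_field_replicas *r = bch2_sb_get_replicas(sb);
	struct bch_replicas_entry *e;
	unsigned i;

	if (r)
		for_each_replicas_entry(r, e)
			for (i = 0; i < e->nr; i++)
				printf("data type %u on device %u\n",
				       e->data_type, e->devs[i]);
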
index 4e8b0a5156e5803b5c008224e48fa540396c9e4b..60a2d83ed8c109a8e214ec85fcfe5d5b88c79498 100644 (file)
@@ -140,8 +140,9 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
        return c;
 }
 
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
 {
+       struct bch_fs *c = data;
        struct backing_dev_info *bdi;
        struct bch_dev *ca;
        unsigned i;
@@ -178,13 +179,6 @@ int bch2_congested(struct bch_fs *c, int bdi_bits)
        return ret;
 }
 
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
-       struct bch_fs *c = data;
-
-       return bch2_congested(c, bdi_bits);
-}
-
 /* Filesystem RO/RW: */
 
 /*
@@ -218,7 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
         */
-       bch2_journal_flush_pins(&c->journal, U64_MAX);
+       bch2_journal_flush_all_pins(&c->journal);
 
        if (!bch2_journal_error(&c->journal))
                bch2_btree_verify_flushed(c);
@@ -379,8 +373,6 @@ static void bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[WRITE]);
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
-       if (c->bdi.bdi_list.next)
-               bdi_destroy(&c->bdi);
        lg_lock_free(&c->usage_lock);
        free_percpu(c->usage_percpu);
        mempool_exit(&c->btree_bounce_pool);
@@ -393,7 +385,7 @@ static void bch2_fs_free(struct bch_fs *c)
        mempool_exit(&c->btree_reserve_pool);
        mempool_exit(&c->fill_iter);
        percpu_ref_exit(&c->writes);
-       kfree(c->replicas);
+       kfree(rcu_dereference_protected(c->replicas, 1));
 
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
@@ -414,7 +406,7 @@ static void bch2_fs_exit(struct bch_fs *c)
 
        for (i = 0; i < c->sb.nr_devices; i++)
                if (c->devs[i])
-                       bch2_dev_free(c->devs[i]);
+                       bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
 
        closure_debug_destroy(&c->cl);
        kobject_put(&c->kobj);
@@ -576,10 +568,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                                      sizeof(struct btree_update)) ||
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
            bioset_init(&c->btree_read_bio, 1,
-                       offsetof(struct btree_read_bio, bio)) ||
-           bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
-           bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
-           bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+                       offsetof(struct btree_read_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS) ||
+           bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS) ||
            mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->opts.btree_node_size,
@@ -588,7 +584,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
            lg_lock_init(&c->usage_lock) ||
            mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
-           bdi_setup_and_register(&c->bdi, "bcachefs") ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
@@ -599,10 +594,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_fsio_init(c))
                goto err;
 
-       c->bdi.ra_pages         = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-       c->bdi.congested_fn     = bch2_congested_fn;
-       c->bdi.congested_data   = c;
-
        mi = bch2_sb_get_members(c->disk_sb);
        for (i = 0; i < c->sb.nr_devices; i++)
                if (bch2_dev_exists(c->disk_sb, mi, i) &&
@@ -729,8 +720,12 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                                continue;
 
                        err = "error reading btree root";
-                       if (bch2_btree_root_read(c, i, k, level))
-                               goto err;
+                       if (bch2_btree_root_read(c, i, k, level)) {
+                               if (i != BTREE_ID_ALLOC)
+                                       goto err;
+
+                               mustfix_fsck_err(c, "error reading btree root");
+                       }
                }
 
                err = "error reading allocation information";
@@ -830,7 +825,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                closure_sync(&cl);
 
                bch2_inode_init(c, &inode, 0, 0,
-                              S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+                              S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
                inode.bi_inum = BCACHEFS_ROOT_INO;
 
                bch2_inode_pack(&packed_inode, &inode);
@@ -877,6 +872,7 @@ out:
        bch2_journal_entries_free(&journal);
        return err;
 err:
+fsck_err:
        closure_sync(&cl);
 
        switch (ret) {
@@ -995,24 +991,20 @@ static void bch2_dev_free(struct bch_dev *ca)
        kobject_put(&ca->kobj);
 }
 
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
-       struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
-       complete(&ca->offline_complete);
-}
-
 static void __bch2_dev_offline(struct bch_dev *ca)
 {
        struct bch_fs *c = ca->fs;
 
        lockdep_assert_held(&c->state_lock);
 
+       if (percpu_ref_is_zero(&ca->io_ref))
+               return;
+
        __bch2_dev_read_only(c, ca);
 
-       reinit_completion(&ca->offline_complete);
+       reinit_completion(&ca->io_ref_completion);
        percpu_ref_kill(&ca->io_ref);
-       wait_for_completion(&ca->offline_complete);
+       wait_for_completion(&ca->io_ref_completion);
 
        if (ca->kobj.state_in_sysfs) {
                struct kobject *block =
@@ -1026,27 +1018,18 @@ static void __bch2_dev_offline(struct bch_dev *ca)
        bch2_dev_journal_exit(ca);
 }
 
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
 {
        struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
 
-       complete(&ca->stop_complete);
+       complete(&ca->ref_completion);
 }
 
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
 {
-       struct bch_fs *c = ca->fs;
-
-       lockdep_assert_held(&c->state_lock);
-
-       BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
-       rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
-       synchronize_rcu();
+       struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
 
-       reinit_completion(&ca->stop_complete);
-       percpu_ref_kill(&ca->ref);
-       wait_for_completion(&ca->stop_complete);
+       complete(&ca->io_ref_completion);
 }
 
 static int bch2_dev_sysfs_online(struct bch_dev *ca)
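Both release callbacks are now trivial completion signals, and __bch2_dev_offline() gains an early return when io_ref is already dead, making offlining idempotent; the old bch2_dev_stop() is folded into the remove path below. The kill/wait pair is the stock percpu_ref quiesce pattern; in general form (illustrative):

    /* generic form of the quiesce pattern the code above instantiates
     * for ca->ref and ca->io_ref (illustrative): */
    static void quiesce(struct percpu_ref *ref, struct completion *done)
    {
            reinit_completion(done);
            percpu_ref_kill(ref);          /* refuse new acquisitions */
            wait_for_completion(done);     /* release cb fires at zero */
    }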
@@ -1095,8 +1078,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
                return -ENOMEM;
 
        kobject_init(&ca->kobj, &bch2_dev_ktype);
-       init_completion(&ca->stop_complete);
-       init_completion(&ca->offline_complete);
+       init_completion(&ca->ref_completion);
+       init_completion(&ca->io_ref_completion);
 
        ca->dev_idx = dev_idx;
        __set_bit(ca->dev_idx, ca->self.d);
@@ -1132,9 +1115,9 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
                DIV_ROUND_UP(BTREE_NODE_RESERVE,
                             ca->mi.bucket_size / c->opts.btree_node_size);
 
-       if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+       if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
                            0, GFP_KERNEL) ||
-           percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+           percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
            !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
                       GFP_KERNEL) ||
@@ -1155,7 +1138,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
                                            GFP_KERNEL|__GFP_ZERO)) ||
            !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
            bioset_init(&ca->replica_set, 4,
-                       offsetof(struct bch_write_bio, bio)) ||
+                       offsetof(struct bch_write_bio, bio), 0) ||
            !(ca->io_done       = alloc_percpu(*ca->io_done)))
                goto err;
 
@@ -1180,8 +1163,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
        struct bch_dev *ca;
        int ret;
 
-       lockdep_assert_held(&c->sb_lock);
-
        if (le64_to_cpu(sb->sb->seq) >
            le64_to_cpu(c->disk_sb->seq))
                bch2_sb_to_fs(c, sb->sb);
@@ -1189,13 +1170,15 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
        BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
               !c->devs[sb->sb->dev_idx]);
 
-       ca = c->devs[sb->sb->dev_idx];
+       ca = bch_dev_locked(c, sb->sb->dev_idx);
        if (ca->disk_sb.bdev) {
                bch_err(c, "already have device online in slot %u",
                        sb->sb->dev_idx);
                return -EINVAL;
        }
 
+       BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
        ret = bch2_dev_journal_init(ca, sb->sb);
        if (ret)
                return ret;
@@ -1222,7 +1205,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
        if (bch2_dev_sysfs_online(ca))
                pr_warn("error creating sysfs objects");
 
-       bch2_mark_dev_superblock(c, ca, 0);
+       bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
 
        if (ca->mi.state == BCH_MEMBER_STATE_RW)
                bch2_dev_allocator_add(c, ca);
@@ -1293,6 +1276,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 {
        struct replicas_status s;
        struct bch_sb_field_members *mi;
+       struct bch_dev *ca;
        unsigned i, flags = c->opts.degraded
                ? BCH_FORCE_IF_DEGRADED
                : 0;
@@ -1301,14 +1285,19 @@ static bool bch2_fs_may_start(struct bch_fs *c)
                mutex_lock(&c->sb_lock);
                mi = bch2_sb_get_members(c->disk_sb);
 
-               for (i = 0; i < c->disk_sb->nr_devices; i++)
-                       if (bch2_dev_exists(c->disk_sb, mi, i) &&
-                           !bch2_dev_is_online(c->devs[i]) &&
-                           (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
-                            c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+               for (i = 0; i < c->disk_sb->nr_devices; i++) {
+                       if (!bch2_dev_exists(c->disk_sb, mi, i))
+                               continue;
+
+                       ca = bch_dev_locked(c, i);
+
+                       if (!bch2_dev_is_online(ca) &&
+                           (ca->mi.state == BCH_MEMBER_STATE_RW ||
+                            ca->mi.state == BCH_MEMBER_STATE_RO)) {
                                mutex_unlock(&c->sb_lock);
                                return false;
                        }
+               }
                mutex_unlock(&c->sb_lock);
        }
 
@@ -1419,22 +1408,59 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
         *
         * flag_data_bad() does not check btree pointers
         */
-       ret = bch2_flag_data_bad(ca);
+       ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
        if (ret) {
-               bch_err(ca, "Remove failed");
+               bch_err(ca, "Remove failed: error %i dropping data", ret);
+               goto err;
+       }
+
+       ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+       if (ret) {
+               bch_err(ca, "Remove failed: error %i flushing journal", ret);
                goto err;
        }
 
        data = bch2_dev_has_data(c, ca);
        if (data) {
-               bch_err(ca, "Remove failed, still has data (%x)", data);
+               char data_has_str[100];
+               bch2_scnprint_flag_list(data_has_str,
+                                       sizeof(data_has_str),
+                                       bch2_data_types,
+                                       data);
+               bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+               ret = -EBUSY;
                goto err;
        }
 
-       bch2_journal_meta(&c->journal);
+       ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+                                     POS(ca->dev_idx, 0),
+                                     POS(ca->dev_idx + 1, 0),
+                                     ZERO_VERSION,
+                                     NULL, NULL, NULL);
+       if (ret) {
+               bch_err(ca, "Remove failed, error deleting alloc info");
+               goto err;
+       }
+
+       /*
+        * must flush all existing journal entries, they might have
+        * (overwritten) keys that point to the device we're removing:
+        */
+       ret = bch2_journal_flush_all_pins(&c->journal);
+       if (ret) {
+               bch_err(ca, "Remove failed, journal error");
+               goto err;
+       }
 
        __bch2_dev_offline(ca);
-       bch2_dev_stop(ca);
+
+       mutex_lock(&c->sb_lock);
+       rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+       mutex_unlock(&c->sb_lock);
+
+       percpu_ref_kill(&ca->ref);
+       wait_for_completion(&ca->ref_completion);
+
        bch2_dev_free(ca);
 
        /*
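The rewritten bch2_dev_remove() is ordering-sensitive: user data is dropped first, the journal flushed, the device's alloc keys deleted, and all journal pins flushed so no entry still references the device, and only then is the member slot cleared and the last ref dropped. Condensed, with all error handling elided (every name as in the hunk above):

    /* condensed sketch of the removal sequence; error handling elided */
    static void dev_remove_sketch(struct bch_fs *c, struct bch_dev *ca,
                                  int flags)
    {
            bch2_dev_data_drop(c, ca->dev_idx, flags);  /* move/drop data */
            bch2_journal_flush_device(&c->journal, ca->dev_idx);
            /* bch2_dev_has_data(c, ca) must be 0 at this point */
            bch2_btree_delete_range(c, BTREE_ID_ALLOC,  /* drop alloc keys */
                                    POS(ca->dev_idx, 0),
                                    POS(ca->dev_idx + 1, 0),
                                    ZERO_VERSION, NULL, NULL, NULL);
            bch2_journal_flush_all_pins(&c->journal);   /* no stale refs */
            __bch2_dev_offline(ca);
            rcu_assign_pointer(c->devs[ca->dev_idx], NULL); /* under sb_lock */
            percpu_ref_kill(&ca->ref);
            wait_for_completion(&ca->ref_completion);
            bch2_dev_free(ca);
    }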
@@ -1542,7 +1568,7 @@ have_slot:
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       ca = c->devs[dev_idx];
+       ca = bch_dev_locked(c, dev_idx);
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                err = "journal alloc failed";
                if (bch2_dev_journal_alloc(ca))
@@ -1568,7 +1594,7 @@ err:
 /* Hot add existing device to running filesystem: */
 int bch2_dev_online(struct bch_fs *c, const char *path)
 {
-       struct bch_sb_handle sb = { 0 };
+       struct bch_sb_handle sb = { NULL };
        struct bch_dev *ca;
        unsigned dev_idx;
        const char *err;
@@ -1593,7 +1619,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
        }
        mutex_unlock(&c->sb_lock);
 
-       ca = c->devs[dev_idx];
+       ca = bch_dev_locked(c, dev_idx);
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                err = __bch2_dev_read_write(c, ca);
                if (err)
@@ -1619,7 +1645,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
                return -EINVAL;
        }
 
-       __bch2_dev_read_only(c, ca);
        __bch2_dev_offline(ca);
 
        mutex_unlock(&c->state_lock);
@@ -1629,37 +1654,31 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 {
        unsigned data;
-       int ret;
+       int ret = 0;
 
        mutex_lock(&c->state_lock);
 
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                bch_err(ca, "Cannot migrate data off RW device");
-               mutex_unlock(&c->state_lock);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto err;
        }
 
-       mutex_unlock(&c->state_lock);
-
-       ret = bch2_move_data_off_device(ca);
+       ret = bch2_dev_data_migrate(c, ca, 0);
        if (ret) {
                bch_err(ca, "Error migrating data: %i", ret);
-               return ret;
-       }
-
-       ret = bch2_move_metadata_off_device(ca);
-       if (ret) {
-               bch_err(ca, "Error migrating metadata: %i", ret);
-               return ret;
+               goto err;
        }
 
        data = bch2_dev_has_data(c, ca);
        if (data) {
                bch_err(ca, "Migrate error: data still present (%x)", data);
-               return -EINVAL;
+               ret = -EINVAL;
+               goto err;
        }
-
-       return 0;
+err:
+       mutex_unlock(&c->state_lock);
+       return ret;
 }
 
 /* Filesystem open: */
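bch2_dev_evacuate() likewise now funnels every exit through a single unlock, and the separate data/metadata migration calls collapse into one bch2_dev_data_migrate(). The control flow is the usual goto-cleanup idiom; minimal shape, with precondition() and do_work() as hypothetical stand-ins:

    /* minimal shape of the single-unlock idiom used above;
     * precondition() and do_work() are hypothetical stand-ins */
    static int locked_op(struct bch_fs *c)
    {
            int ret = 0;

            mutex_lock(&c->state_lock);
            if (!precondition()) {
                    ret = -EINVAL;
                    goto err;
            }
            ret = do_work();
    err:
            mutex_unlock(&c->state_lock);
            return ret;
    }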
index eb1d2f3d08c6c0e94e026b0d70ad78ae071800cf..7ebe5981bf45a9acd2ee27285df82639ad90c7c0 100644 (file)
@@ -59,6 +59,14 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
                }
 }
 
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+                                        unsigned dev)
+{
+       BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+       BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+       devs->devs[devs->nr++] = dev;
+}
+
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
                                              struct bch_devs_mask *mask)
 {
@@ -131,6 +139,26 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
        __for_each_online_member(ca, c, iter,                           \
                (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
 
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_protected(c->devs[idx],
+                                        lockdep_is_held(&c->sb_lock) ||
+                                        lockdep_is_held(&c->state_lock));
+}
+
 /* XXX kill, move to struct bch_fs */
 static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
 {
@@ -146,7 +174,7 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
 
 struct bch_fs *bch2_bdev_to_fs(struct block_device *);
 struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(struct bch_fs *, int);
+int bch2_congested(void *, int);
 
 bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
                           enum bch_member_state, int);
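The two accessors added to super.h above encode when c->devs[] may be dereferenced without rcu_read_lock(): bch_dev_bkey_exists() when a live key referencing the device pins it, bch_dev_locked() when sb_lock or state_lock is held, which is what the c->devs[i] conversions elsewhere in this commit rely on. Illustrative callers:

    /* illustrative callers mirroring the conversions in this commit */
    static void use_from_key(struct bch_fs *c,
                             const struct bch_extent_ptr *ptr)
    {
            /* an extent key references the device, so it can't vanish: */
            struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);

            (void) ca;
    }

    static void use_under_lock(struct bch_fs *c, unsigned dev_idx)
    {
            struct bch_dev *ca;

            mutex_lock(&c->sb_lock);
            ca = bch_dev_locked(c, dev_idx); /* lockdep check is satisfied */
            (void) ca;
            mutex_unlock(&c->sb_lock);
    }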
index 35f1e561c3f5eabf4ecee241feb68f167f10c0b5..3197a2e46166709b7181fcec95760f5dbbcc4aff 100644 (file)
@@ -739,7 +739,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                c->open_buckets_wait.list.first         ? "waiting" : "empty");
 }
 
-const char * const bch2_rw[] = {
+static const char * const bch2_rw[] = {
        "read",
        "write",
        NULL
index 2e29f7414c50e2087bb3316745efd2aaf36e0df4..f5007864c6b609ce38480b89e65ad880af4227c4 100644 (file)
@@ -6,7 +6,6 @@
 #include "clock.h"
 #include "extents.h"
 #include "io.h"
-#include "keylist.h"
 #include "move.h"
 #include "super-io.h"
 #include "tier.h"
@@ -28,7 +27,7 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
                return false;
 
        extent_for_each_ptr(e, ptr)
-               if (c->devs[ptr->dev]->mi.tier >= tier->idx)
+               if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
                        replicas++;
 
        return replicas < c->opts.data_replicas;
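tiering_pred() now reaches devices through bch_dev_bkey_exists(): it counts extent pointers whose device already sits at or above the target tier and keeps scheduling moves while that count falls short of data_replicas. A standalone restatement of the logic, with simplified stand-in types:

    /* standalone restatement of the predicate; the types are simplified
     * stand-ins, not the bcachefs structures */
    #include <stdbool.h>

    struct ptr_info { unsigned dev_tier; };

    static bool needs_tiering(const struct ptr_info *ptrs, unsigned nr,
                              unsigned tier_idx, unsigned data_replicas)
    {
            unsigned i, replicas = 0;

            for (i = 0; i < nr; i++)
                    if (ptrs[i].dev_tier >= tier_idx)
                            replicas++;

            /* true => the extent still needs a copy on this tier */
            return replicas < data_replicas;
    }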
index a251bf9c104535a9f074030192d716580bc13b35..6e97e83184e17d0a4e4a8c617888152c20395686 100644 (file)
@@ -34,8 +34,12 @@ struct closure;
 #define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
 #define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
 
-#define memcpy(_dst, _src, _len)                                       \
+#define memcpy(dst, src, len)                                          \
 ({                                                                     \
+       void *_dst = (dst);                                             \
+       const void *_src = (src);                                       \
+       size_t _len = (len);                                            \
+                                                                       \
        BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) ||         \
                 (void *) (_dst) + (_len) <= (void *) (_src)));         \
        memcpy(_dst, _src, _len);                                       \
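The rewritten memcpy() wrapper in util.h evaluates each argument exactly once into a local before the overlap BUG_ON(); in the old form every argument expression was expanded twice, once in the check and once in the real copy. A standalone demo of that hazard, built around a deliberately bad macro (nothing below is bcachefs code):

    /* deliberately bad macro to show double evaluation;
     * build with: cc -o demo demo.c && ./demo */
    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define BAD_COPY(dst, src, len) \
            (assert((len) > 0), memcpy((dst), (src), (len)))

    int main(void)
    {
            char a[8] = "", b[8] = "abcdefg";
            int n = 7;

            BAD_COPY(a, b, n--);   /* n-- runs in assert() AND memcpy() */
            printf("n is now %d (6 bytes copied, not 7)\n", n); /* n == 5 */
            return 0;
    }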
index ce2cece0d0cb8672b9953659625a18410e80e973..795664428876e6d6ab3d1d0c3a601c40cfc179c7 100644 (file)
@@ -9,10 +9,10 @@
  */
 #define __vstruct_u64s(_s)                                             \
 ({                                                                     \
-       ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s)            \
-       : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s)            \
-       : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s)            \
-       : ((_s)->u64s));                                                \
+       ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s)           \
+       : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s)           \
+       : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s)           \
+       : ((__force u8) ((_s)->u64s)));                                         \
 })
 
 #define __vstruct_bytes(_type, _u64s)                                  \
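The __force casts in __vstruct_u64s() are for sparse: u64s is a __le16, __le32, or __le64 depending on the struct, and while type_is() selects exactly one live branch, the compiler and sparse still type-check the dead ones, so each le*_to_cpu() argument must be force-cast to the width that branch expects. The branch-selection trick itself, standalone (type_is() shown with an illustrative definition; the real one is in the bcachefs headers):

    #include <stdio.h>

    /* illustrative definition; the real type_is() lives in the
     * bcachefs headers */
    #define type_is(_val, _type) \
            __builtin_types_compatible_p(__typeof__(_val), _type)

    struct jset { unsigned short u64s; };   /* stand-in vstruct */

    int main(void)
    {
            struct jset j = { .u64s = 5 };

            /* all three arms are compiled and type-checked even though
             * type_is() makes two of them dead; in the real macro each
             * dead arm calls a differently typed le*_to_cpu(), hence
             * the __force casts */
            unsigned long long v =
                      type_is(j.u64s, unsigned long long) ? j.u64s
                    : type_is(j.u64s, unsigned int)       ? j.u64s
                    : j.u64s;

            printf("%llu\n", v);
            return 0;
    }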
index 3a49d72817545161ac34a8cdf193afa8f80b53e2..1d6cbe72e05b56e1ec83e15c754b00fb07e6d0c1 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
+#include "compress.h"
 #include "extents.h"
 #include "fs.h"
 #include "str_hash.h"
@@ -358,25 +359,139 @@ static const struct xattr_handler bch_xattr_security_handler = {
        .flags  = BCH_XATTR_INDEX_SECURITY,
 };
 
-static const struct xattr_handler *bch_xattr_handler_map[] = {
-       [BCH_XATTR_INDEX_USER]                  = &bch_xattr_user_handler,
-       [BCH_XATTR_INDEX_POSIX_ACL_ACCESS]      =
-               &posix_acl_access_xattr_handler,
-       [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT]     =
-               &posix_acl_default_xattr_handler,
-       [BCH_XATTR_INDEX_TRUSTED]               = &bch_xattr_trusted_handler,
-       [BCH_XATTR_INDEX_SECURITY]              = &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+                                  struct dentry *dentry, struct inode *vinode,
+                                  const char *name, void *buffer, size_t size)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_opts opts =
+               bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+       const struct bch_option *opt;
+       int ret, id;
+       u64 v;
+
+       id = bch2_opt_lookup(name);
+       if (id < 0 || !bch2_opt_is_inode_opt(id))
+               return -EINVAL;
+
+       opt = bch2_opt_table + id;
+
+       if (!bch2_opt_defined_by_id(&opts, id))
+               return -ENODATA;
+
+       v = bch2_opt_get_by_id(&opts, id);
+
+       if (opt->type == BCH_OPT_STR)
+               ret = snprintf(buffer, size, "%s", opt->choices[v]);
+       else
+               ret = snprintf(buffer, size, "%llu", v);
+
+       return ret <= size || !buffer ? ret : -ERANGE;
+}
+
+struct inode_opt_set {
+       int                     id;
+       u64                     v;
+       bool                    defined;
 };
 
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+                           struct bch_inode_unpacked *bi,
+                           void *p)
+{
+       struct inode_opt_set *s = p;
+
+       if (s->defined)
+               bch2_inode_opt_set(bi, s->id, s->v);
+       else
+               bch2_inode_opt_clear(bi, s->id);
+       return 0;
+}
+
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+                                  struct dentry *dentry, struct inode *vinode,
+                                  const char *name, const void *value,
+                                  size_t size, int flags)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       const struct bch_option *opt;
+       char *buf;
+       struct inode_opt_set s;
+       int ret;
+
+       s.id = bch2_opt_lookup(name);
+       if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+               return -EINVAL;
+
+       opt = bch2_opt_table + s.id;
+
+       if (value) {
+               buf = kmalloc(size + 1, GFP_KERNEL);
+               if (!buf)
+                       return -ENOMEM;
+               memcpy(buf, value, size);
+               buf[size] = '\0';
+
+               ret = bch2_opt_parse(opt, buf, &s.v);
+               kfree(buf);
+
+               if (ret < 0)
+                       return ret;
+
+               if (s.id == Opt_compression) {
+                       mutex_lock(&c->sb_lock);
+                       ret = bch2_check_set_has_compressed_data(c, s.v);
+                       mutex_unlock(&c->sb_lock);
+
+                       if (ret)
+                               return ret;
+               }
+
+               s.defined = true;
+       } else {
+               s.defined = false;
+       }
+
+       mutex_lock(&inode->ei_update_lock);
+       ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+       mutex_unlock(&inode->ei_update_lock);
+
+       return ret;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+       .prefix = "bcachefs.",
+       .get    = bch2_xattr_bcachefs_get,
+       .set    = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
 const struct xattr_handler *bch2_xattr_handlers[] = {
        &bch_xattr_user_handler,
        &posix_acl_access_xattr_handler,
        &posix_acl_default_xattr_handler,
        &bch_xattr_trusted_handler,
        &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+       &bch_xattr_bcachefs_handler,
+#endif
        NULL
 };
 
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+       [BCH_XATTR_INDEX_USER]                  = &bch_xattr_user_handler,
+       [BCH_XATTR_INDEX_POSIX_ACL_ACCESS]      =
+               &posix_acl_access_xattr_handler,
+       [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT]     =
+               &posix_acl_default_xattr_handler,
+       [BCH_XATTR_INDEX_TRUSTED]               = &bch_xattr_trusted_handler,
+       [BCH_XATTR_INDEX_SECURITY]              = &bch_xattr_security_handler,
+};
+
 static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
 {
        return type < ARRAY_SIZE(bch_xattr_handler_map)
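The new "bcachefs." handler in xattr.c exposes per-inode options (whatever bch2_opt_is_inode_opt() accepts) through the ordinary xattr syscalls, parsing values with bch2_opt_parse() on set and formatting with snprintf() on get; removing the xattr (value == NULL in the handler) clears the option. From userspace that looks roughly like this sketch (the path and the lz4 value are illustrative):

    /* userspace sketch of the new interface; path and value are
     * illustrative */
    #include <stdio.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char buf[64];
            ssize_t n;

            if (setxattr("/mnt/file", "bcachefs.compression", "lz4", 3, 0))
                    perror("setxattr");

            n = getxattr("/mnt/file", "bcachefs.compression",
                         buf, sizeof(buf));
            if (n >= 0)
                    printf("compression: %.*s\n", (int) n, buf);
            return 0;
    }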
index f43566993171bf3c7f0641a1f7a06d996b6230aa..d8256989b99d7012aba6c20c8d3bfd3ce08f6fea 100644 (file)
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
-#include <linux/export.h>
+
+static const struct {
+       int             err;
+       const char      *name;
+} blk_errors[] = {
+       [BLK_STS_OK]            = { 0,          "" },
+       [BLK_STS_NOTSUPP]       = { -EOPNOTSUPP, "operation not supported" },
+       [BLK_STS_TIMEOUT]       = { -ETIMEDOUT, "timeout" },
+       [BLK_STS_NOSPC]         = { -ENOSPC,    "critical space allocation" },
+       [BLK_STS_TRANSPORT]     = { -ENOLINK,   "recoverable transport" },
+       [BLK_STS_TARGET]        = { -EREMOTEIO, "critical target" },
+       [BLK_STS_NEXUS]         = { -EBADE,     "critical nexus" },
+       [BLK_STS_MEDIUM]        = { -ENODATA,   "critical medium" },
+       [BLK_STS_PROTECTION]    = { -EILSEQ,    "protection" },
+       [BLK_STS_RESOURCE]      = { -ENOMEM,    "kernel resource" },
+       [BLK_STS_AGAIN]         = { -EAGAIN,    "nonblocking retry" },
+
+       /* device mapper special case, should not leak out: */
+       [BLK_STS_DM_REQUEUE]    = { -EREMCHG, "dm internal retry" },
+
+       /* everything else not covered above: */
+       [BLK_STS_IOERR]         = { -EIO,       "I/O" },
+};
+
+int blk_status_to_errno(blk_status_t status)
+{
+       int idx = (__force int)status;
+
+       if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+               return -EIO;
+       return blk_errors[idx].err;
+}
 
 void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                        struct bio *src, struct bvec_iter *src_iter)
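blk_status_t values are small table indices rather than negative errnos, so the blk_errors[] table added to linux/bio.c translates only at boundaries where errno conventions are expected, and anything out of range collapses to -EIO via the WARN_ON_ONCE(). An illustrative consumer, assuming this tree's userspace struct bio:

    /* illustrative consumer of the new translation; assumes the
     * userspace struct bio from this tree's include/linux/bio.h */
    #include <linux/bio.h>
    #include <stdio.h>
    #include <string.h>

    static void example_endio(struct bio *bio)
    {
            int err = blk_status_to_errno(bio->bi_status);

            if (err)
                    fprintf(stderr, "bio failed: %s\n", strerror(-err));
    }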
@@ -199,8 +230,8 @@ static struct bio *__bio_chain_endio(struct bio *bio)
 {
        struct bio *parent = bio->bi_private;
 
-       if (!parent->bi_error)
-               parent->bi_error = bio->bi_error;
+       if (!parent->bi_status)
+               parent->bi_status = bio->bi_status;
        bio_put(bio);
        return parent;
 }
@@ -233,27 +264,6 @@ again:
                bio->bi_end_io(bio);
 }
 
-void bio_endio_nodec(struct bio *bio)
-{
-       goto nodec;
-
-       while (bio) {
-               if (unlikely(!bio_remaining_done(bio)))
-                       break;
-nodec:
-               if (bio->bi_end_io == bio_chain_endio) {
-                       struct bio *parent = bio->bi_private;
-                       parent->bi_error = bio->bi_error;
-                       bio_put(bio);
-                       bio = parent;
-               } else {
-                       if (bio->bi_end_io)
-                               bio->bi_end_io(bio);
-                       bio = NULL;
-               }
-       }
-}
-
 void bio_reset(struct bio *bio)
 {
        unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
index ea7db40b2fcc9f1b206cee5957ec5a5a6ac16574..156d5353b2fbfd676caed6bd47b66aa4a729cc52 100644 (file)
@@ -32,7 +32,7 @@ void generic_make_request(struct bio *bio)
                ret = fdatasync(bio->bi_bdev->bd_fd);
                if (ret) {
                        fprintf(stderr, "fsync error: %m\n");
-                       bio->bi_error = -EIO;
+                       bio->bi_status = BLK_STS_IOERR;
                        bio_endio(bio);
                        return;
                }
@@ -106,7 +106,7 @@ int submit_bio_wait(struct bio *bio)
        submit_bio(bio);
        wait_for_completion(&done);
 
-       return bio->bi_error;
+       return blk_status_to_errno(bio->bi_status);
 }
 
 int blkdev_issue_discard(struct block_device *bdev,
@@ -235,10 +235,8 @@ static int aio_completion_thread(void *arg)
                for (ev = events; ev < events + ret; ev++) {
                        struct bio *bio = (struct bio *) ev->data;
 
-                       if (ev->res < 0)
-                               bio->bi_error = ev->res;
-                       else if (ev->res != bio->bi_iter.bi_size)
-                               bio->bi_error = -EIO;
+                       if (ev->res != bio->bi_iter.bi_size)
+                               bio->bi_status = BLK_STS_IOERR;
 
                        bio_endio(bio);
                }
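In linux/blkdev.c's aio completion thread, ev->res is either a negative errno or the number of bytes transferred, so the single comparison against bi_size now covers both the error branch and the short-I/O branch that were previously separate. Spelled out (illustrative):

    /* equivalent expanded form of the single check above (illustrative);
     * res is a negative errno or the completed byte count */
    static inline int aio_event_failed(long res, unsigned expected_bytes)
    {
            return res < 0 || (unsigned long) res != expected_bytes;
    }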