-e57b5958cf4e8530d26f7c36a6e1427fb284cc70
+14ce2a2031f3761a4b957aa2e5aac446ce18b87c
list_modes, "list mode");
break;
case 'f':
- opts.fix_errors = FSCK_ERR_YES;
- opts.norecovery = false;
+ opt_set(opts, fix_errors, FSCK_OPT_YES);
+ opt_set(opts, norecovery, false);
break;
case 'v':
- opts.verbose_recovery = true;
+ opt_set(opts, verbose_recovery, true);
break;
case 'h':
list_keys_usage();
int opt;
opt_set(opts, degraded, true);
+ opt_set(opts, fix_errors, FSCK_OPT_ASK);
while ((opt = getopt(argc, argv, "pynfvh")) != -1)
switch (opt) {
case 'p':
- opt_set(opts, fix_errors, FSCK_ERR_YES);
+ opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'y':
- opt_set(opts, fix_errors, FSCK_ERR_YES);
+ opt_set(opts, fix_errors, FSCK_OPT_YES);
break;
case 'n':
opt_set(opts, nochanges, true);
- opt_set(opts, fix_errors, FSCK_ERR_NO);
+ opt_set(opts, fix_errors, FSCK_OPT_NO);
break;
case 'f':
/* force check, even if filesystem marked clean: */
struct bch_inode_unpacked new_inode;
int ret;
- bch2_inode_init(c, &new_inode, uid, gid, mode, rdev);
+ bch2_inode_init(c, &new_inode, uid, gid, mode, rdev, parent);
ret = bch2_inode_create(c, &new_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
struct bch_inode_unpacked *dst_inode,
u64 dst_offset, void *buf, size_t len)
{
- struct disk_reservation res;
struct bch_write_op op;
struct bio_vec bv;
struct closure cl;
op.wbio.bio.bi_iter.bi_size = len;
bch2_bio_map(&op.wbio.bio, buf);
- int ret = bch2_disk_reservation_get(c, &res, len >> 9, 0);
+ bch2_write_op_init(&op, c);
+
+ op.write_point = writepoint_hashed(0);
+ op.pos = POS(dst_inode->bi_inum, dst_offset >> 9);
+
+ int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, 0);
if (ret)
die("error reserving space in new filesystem: %s", strerror(-ret));
- bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0),
- POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0);
closure_call(&op.cl, bch2_write, NULL, &cl);
closure_sync(&cl);
static inline int bioset_init(struct bio_set *bs,
unsigned pool_size,
- unsigned front_pad)
+ unsigned front_pad,
+ int flags)
{
bs->front_pad = front_pad;
return 0;
extern struct bio_set *bioset_create(unsigned int, unsigned int);
extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
+enum {
+ BIOSET_NEED_BVECS = 1 << 0,
+ BIOSET_NEED_RESCUER = 1 << 1,
+};
extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
extern void bio_put(struct bio *);
}
extern void bio_endio(struct bio *);
-extern void bio_endio_nodec(struct bio *);
-
-static inline void bio_io_error(struct bio *bio)
-{
- bio->bi_error = -EIO;
- bio_endio(bio);
-}
extern void bio_advance(struct bio *, unsigned);
struct bio;
struct block_device;
typedef void (bio_end_io_t) (struct bio *);
-typedef void (bio_destructor_t) (struct bio *);
+
+/*
+ * Block error status values. See block/blk-core:blk_errors for the details.
+ */
+typedef u8 __bitwise blk_status_t;
+#define BLK_STS_OK 0
+#define BLK_STS_NOTSUPP ((__force blk_status_t)1)
+#define BLK_STS_TIMEOUT ((__force blk_status_t)2)
+#define BLK_STS_NOSPC ((__force blk_status_t)3)
+#define BLK_STS_TRANSPORT ((__force blk_status_t)4)
+#define BLK_STS_TARGET ((__force blk_status_t)5)
+#define BLK_STS_NEXUS ((__force blk_status_t)6)
+#define BLK_STS_MEDIUM ((__force blk_status_t)7)
+#define BLK_STS_PROTECTION ((__force blk_status_t)8)
+#define BLK_STS_RESOURCE ((__force blk_status_t)9)
+#define BLK_STS_IOERR ((__force blk_status_t)10)
+
+/* hack for device mapper, don't use elsewhere: */
+#define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
+
+#define BLK_STS_AGAIN ((__force blk_status_t)12)
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
- int bi_error;
+ blk_status_t bi_status;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.
#define capable(cap) true
+int blk_status_to_errno(blk_status_t status);
+blk_status_t errno_to_blk_status(int errno);
+
#endif /* __TOOLS_LINUX_BLKDEV_H */
#define BUG() do { assert(0); unreachable(); } while (0)
#define BUG_ON(cond) assert(!(cond))
-#define WARN_ON_ONCE(cond) assert(!(cond))
+#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0)
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
const struct timespec64 rhs);
+static inline struct timespec timespec_trunc(struct timespec t, unsigned gran)
+{
+ /* Avoid division in the common cases 1 ns and 1 s. */
+ if (gran == 1) {
+ /* nothing */
+ } else if (gran == NSEC_PER_SEC) {
+ t.tv_nsec = 0;
+ } else if (gran > 1 && gran < NSEC_PER_SEC) {
+ t.tv_nsec -= t.tv_nsec % gran;
+ } else {
+ WARN(1, "illegal file time granularity: %u", gran);
+ }
+ return t;
+}
+
#endif /* _LINUX_TIME64_H */
if (ret < 0)
return ret;
else {
- inode->v.i_ctime =
- current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
mark_inode_dirty(&inode->v);
if (ret == 0)
acl = NULL;
return;
a = bkey_s_c_to_alloc(k);
- ca = c->devs[a.k->p.inode];
+ ca = bch_dev_bkey_exists(c, a.k->p.inode);
if (a.k->p.offset >= ca->mi.nbuckets)
return;
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
+ mutex_lock(&c->bucket_lock);
for_each_member_device(ca, c, i) {
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_min_prio(c, ca, WRITE);
}
+ mutex_unlock(&c->bucket_lock);
return 0;
}
if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
return 0;
- ca = c->devs[pos.inode];
+ ca = bch_dev_bkey_exists(c, pos.inode);
if (pos.offset >= ca->mi.nbuckets)
return 0;
/* Bucket heap / gen */
-void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket *g;
void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false,
for (i = wp->nr_ptrs - 1; i >= 0; --i) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
BUG_ON(ca->open_buckets_partial_nr >=
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
BUG_ON(ptr_stale(ca, &ob->ptr));
}
for (i = 0; i < wp->nr_ptrs_can_use; i++) {
struct open_bucket *ob = wp->ptrs[i];
- struct bch_dev *ca = c->devs[ob->ptr.dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
struct bch_extent_ptr tmp = ob->ptr;
EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev));
ra_pages += bdi->ra_pages;
}
- c->bdi.ra_pages = ra_pages;
+ bch2_set_ra_pages(c, ra_pages);
/* Find fastest, slowest tiers with devices: */
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
+ struct completion ref_completion;
struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
struct bch_fs *fs;
struct closure sb_write;
struct mutex sb_lock;
- struct backing_dev_info bdi;
-
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
#undef BCH_TIME_STAT
};
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
+{
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
+}
+
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
} __attribute__((packed, aligned(8)));
BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION);
-#define BCH_INODE_FIELDS() \
- BCH_INODE_FIELD(bi_atime, 64) \
- BCH_INODE_FIELD(bi_ctime, 64) \
- BCH_INODE_FIELD(bi_mtime, 64) \
- BCH_INODE_FIELD(bi_otime, 64) \
- BCH_INODE_FIELD(bi_size, 64) \
- BCH_INODE_FIELD(bi_sectors, 64) \
- BCH_INODE_FIELD(bi_uid, 32) \
- BCH_INODE_FIELD(bi_gid, 32) \
- BCH_INODE_FIELD(bi_nlink, 32) \
- BCH_INODE_FIELD(bi_generation, 32) \
- BCH_INODE_FIELD(bi_dev, 32)
+#define BCH_INODE_FIELDS() \
+ BCH_INODE_FIELD(bi_atime, 64) \
+ BCH_INODE_FIELD(bi_ctime, 64) \
+ BCH_INODE_FIELD(bi_mtime, 64) \
+ BCH_INODE_FIELD(bi_otime, 64) \
+ BCH_INODE_FIELD(bi_size, 64) \
+ BCH_INODE_FIELD(bi_sectors, 64) \
+ BCH_INODE_FIELD(bi_uid, 32) \
+ BCH_INODE_FIELD(bi_gid, 32) \
+ BCH_INODE_FIELD(bi_nlink, 32) \
+ BCH_INODE_FIELD(bi_generation, 32) \
+ BCH_INODE_FIELD(bi_dev, 32) \
+ BCH_INODE_FIELD(bi_data_checksum, 8) \
+ BCH_INODE_FIELD(bi_compression, 8)
+
+#define BCH_INODE_FIELDS_INHERIT() \
+ BCH_INODE_FIELD(bi_data_checksum) \
+ BCH_INODE_FIELD(bi_compression)
enum {
/*
__u8 sb_max_size_bits; /* base 2 of 512 byte sectors */
__u8 nr_superblocks;
__u8 pad[5];
- __u64 sb_offset[61];
+ __le64 sb_offset[61];
} __attribute__((packed, aligned(8)));
#define BCH_SB_LAYOUT_SECTOR 7
};
};
+struct jset_entry_blacklist {
+ struct jset_entry entry;
+ __le64 seq;
+};
+
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {
#include "bcachefs.h"
#include "bkey.h"
+#include "bkey_methods.h"
#include "bset.h"
#include "util.h"
const struct bkey_format *format) {}
#endif
-int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
-{
- char *out = buf, *end = buf + size;
-
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
- p("u64s %u type %u %llu:%llu snap %u len %u ver %llu",
- k->u64s, k->type, k->p.inode, k->p.offset,
- k->p.snapshot, k->size, k->version.lo);
-
- BUG_ON(bkey_packed(k));
-
- switch (k->type) {
- case KEY_TYPE_DELETED:
- p(" deleted");
- break;
- case KEY_TYPE_DISCARD:
- p(" discard");
- break;
- case KEY_TYPE_ERROR:
- p(" error");
- break;
- case KEY_TYPE_COOKIE:
- p(" cookie");
- break;
- }
-#undef p
-
- return out - buf;
-}
-
struct pack_state {
const struct bkey_format *format;
unsigned bits; /* bits remaining in current word */
* Extents - we have to guarantee that if an extent is packed, a trimmed
* version will also pack:
*/
- if (bkey_start_offset(in) < format->field_offset[BKEY_FIELD_OFFSET])
+ if (bkey_start_offset(in) <
+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
return false;
pack_state_finish(&state, out);
bool *eax_zeroed)
{
unsigned bits = format->bits_per_field[field];
- u64 offset = format->field_offset[field];
+ u64 offset = le64_to_cpu(format->field_offset[field]);
unsigned i, byte, bit_offset, align, shl, shr;
if (!bits && !offset) {
#include "vstructs.h"
void bch2_to_binary(char *, const u64 *, unsigned);
-int bch2_bkey_to_text(char *, size_t, const struct bkey *);
#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
- ? f->field_offset[nr] + ~(~0ULL << f->bits_per_field[nr])
+ ? (le64_to_cpu(f->field_offset[nr]) +
+ ~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
};
-/* Returns string indicating reason for being invalid, or NULL if valid: */
-const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
- struct bkey_s_c k)
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
- if (k.k->u64s < BKEY_U64s)
- return "u64s too small";
-
- if (!ops->is_extents) {
- if (k.k->size)
- return "nonzero size field";
- } else {
- if ((k.k->size == 0) != bkey_deleted(k.k))
- return "bad size field";
- }
-
- if (ops->is_extents &&
- !k.k->size &&
- !bkey_deleted(k.k))
- return "zero size field";
-
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
}
}
-const char *bch2_btree_bkey_invalid(struct bch_fs *c, struct btree *b,
- struct bkey_s_c k)
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ const struct bkey_ops *ops = bch2_bkey_ops[type];
+
+ if (k.k->u64s < BKEY_U64s)
+ return "u64s too small";
+
+ if (!ops->is_extents) {
+ if (k.k->size)
+ return "nonzero size field";
+ } else {
+ if ((k.k->size == 0) != bkey_deleted(k.k))
+ return "bad size field";
+ }
+
+ if (ops->is_extents &&
+ !k.k->size &&
+ !bkey_deleted(k.k))
+ return "zero size field";
+
+ if (k.k->p.snapshot)
+ return "nonzero snapshot";
+
+ return NULL;
+}
+
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+ struct bkey_s_c k)
+{
+ return __bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_val_invalid(c, type, k);
+}
+
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
- if (k.k->p.snapshot)
- return "nonzero snapshot";
-
- return bch2_bkey_invalid(c, btree_node_type(b), k);
+ return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
BUG_ON(!k.k->u64s);
- invalid = bch2_btree_bkey_invalid(c, b, k);
+ invalid = bch2_bkey_invalid(c, type, k) ?:
+ bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
ops->key_debugcheck(c, b, k);
}
-char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
+
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
- const struct bkey_ops *ops = bch2_bkey_ops[type];
+ char *out = buf, *end = buf + size;
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text)
- ops->val_to_text(c, buf, size, k);
+ p("u64s %u type %u ", k->u64s, k->type);
+
+ if (bkey_cmp(k->p, POS_MAX))
+ p("%llu:%llu", k->p.inode, k->p.offset);
+ else
+ p("POS_MAX");
- return buf;
+ p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+
+ return out - buf;
}
-char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
- out += bch2_bkey_to_text(out, end - out, k.k);
-
- if (k.k->type >= KEY_TYPE_GENERIC_NR &&
- ops->val_to_text) {
- out += scnprintf(out, end - out, ": ");
- ops->val_to_text(c, out, end - out, k);
+ switch (k.k->type) {
+ case KEY_TYPE_DELETED:
+ p(" deleted");
+ break;
+ case KEY_TYPE_DISCARD:
+ p(" discard");
+ break;
+ case KEY_TYPE_ERROR:
+ p(" error");
+ break;
+ case KEY_TYPE_COOKIE:
+ p(" cookie");
+ break;
+ default:
+ if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
+ ops->val_to_text(c, buf, size, k);
+ break;
}
- return buf;
+ return out - buf;
+}
+
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
+{
+ char *out = buf, *end = buf + size;
+
+ out += bch2_bkey_to_text(out, end - out, k.k);
+ out += scnprintf(out, end - out, ": ");
+ out += bch2_val_to_text(c, type, out, end - out, k);
+
+ return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,
bool is_extents;
};
+const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
+ struct bkey_s_c);
+const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
-const char *bch2_btree_bkey_invalid(struct bch_fs *, struct btree *,
- struct bkey_s_c);
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
+
+int bch2_bkey_to_text(char *, size_t, const struct bkey *);
+int bch2_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t b = PTR_BUCKET_NR(ca, ptr);
if (gen_after(ca->oldest_gens[b], ptr->gen))
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
(!c->opts.nofsck &&
fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c,
- "superblock not marked as containing replicas"))) {
+ "superblock not marked as containing replicas (type %u)",
+ data_type))) {
ret = bch2_check_mark_super(c, e, data_type);
if (ret)
return ret;
}
extent_for_each_ptr(e, ptr) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
if (mustfix_fsck_err_on(!g->mark.gen_valid, c,
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
- if (layout->sb_offset[i] == BCH_SB_SECTOR)
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+ if (offset == BCH_SB_SECTOR)
mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
BUCKET_SB, flags);
- mark_metadata_sectors(c, ca,
- layout->sb_offset[i],
- layout->sb_offset[i] +
- (1 << layout->sb_max_size_bits),
+ mark_metadata_sectors(c, ca, offset,
+ offset + (1 << layout->sb_max_size_bits),
BUCKET_SB, flags);
}
spin_lock(&ob->lock);
if (ob->valid) {
gc_pos_set(c, gc_pos_alloc(c, ob));
- ca = c->devs[ob->ptr.dev];
+ ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
}
}
-void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket *g;
struct bset_tree *t;
struct bset *start_bset = bset(b, &b->set[start_idx]);
bool used_mempool = false;
- u64 start_time;
+ u64 start_time, seq = 0;
unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
bool sorting_entire_node = start_idx == 0 &&
end_idx == b->nsets;
bch2_time_stats_update(&c->btree_sort_time, start_time);
/* Make sure we preserve bset journal_seq: */
- for (t = b->set + start_idx + 1;
- t < b->set + end_idx;
- t++)
- start_bset->journal_seq =
- max(start_bset->journal_seq,
- bset(b, t)->journal_seq);
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
+ start_bset->journal_seq = cpu_to_le64(seq);
if (sorting_entire_node) {
unsigned u64s = le16_to_cpu(out->keys.u64s);
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
+ enum bkey_type type = btree_node_type(b);
bool seen_non_whiteout = false;
const char *err;
int ret = 0;
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
- whiteout_u64s = 0;
+ *whiteout_u64s = 0;
}
for (k = i->start;
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(btree_node_type(b), &b->format, k);
+ bch2_bkey_swab(type, &b->format, k);
u = bkey_disassemble(b, k, &tmp);
- invalid = bch2_btree_bkey_invalid(c, b, u);
+ invalid = __bch2_bkey_invalid(c, type, u) ?:
+ bch2_bkey_in_btree_node(b, u) ?:
+ (write ? bch2_bkey_val_invalid(c, type, u) : NULL);
if (invalid) {
char buf[160];
- bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
struct btree_node_entry *bne;
struct btree_node_iter *iter;
struct btree_node *sorted;
+ struct bkey_packed *k;
+ struct bset *i;
bool used_mempool;
unsigned u64s;
int ret, retry_read = 0, write = READ;
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
- struct bset *i;
if (!b->written) {
i = &b->data->keys;
btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ enum bkey_type type = btree_node_type(b);
+ struct bkey tmp;
+ struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, type, u);
+
+ if (invalid) {
+ char buf[160];
+
+ bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
+ btree_err(BTREE_ERR_FIXABLE, c, b, i,
+ "invalid bkey %s: %s", buf, invalid);
+
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
+ memmove_u64s_down(k, bkey_next(k),
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ continue;
+ }
+
+ k = bkey_next(k);
+ }
+
bch2_bset_build_aux_tree(b, b->set, false);
set_needs_whiteout(btree_bset_first(b));
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
- bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
+ bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
- if (!bio->bi_error &&
+ if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
bch2_btree_node_read(c, b, true);
- six_unlock_write(&b->lock);
if (btree_node_read_error(b)) {
- six_unlock_intent(&b->lock);
- return -EIO;
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ mutex_lock(&c->btree_cache.lock);
+ list_move(&b->list, &c->btree_cache.freeable);
+ mutex_unlock(&c->btree_cache.lock);
+
+ ret = -EIO;
+ goto err;
}
bch2_btree_set_root_for_read(c, b);
+err:
+ six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
- return 0;
+ return ret;
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct closure *cl = wbio->cl;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct btree_iter iter;
+ int ret;
- six_lock_read(&b->lock);
- bkey_copy(&tmp.k, &b->key);
- six_unlock_read(&b->lock);
+ __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH,
+ b->level, 0);
+retry:
+ ret = bch2_btree_iter_traverse(&iter);
+ if (ret)
+ goto err;
- if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
- /* Node has been freed: */
+ /* has node been freed? */
+ if (iter.nodes[b->level] != b) {
+ /* node has been freed: */
+ if (!btree_node_dying(b))
+ panic("foo4\n");
goto out;
}
- new_key = bkey_i_to_extent(&tmp.k);
+ if (!btree_node_hashed(b))
+ panic("foo5\n");
- while (wbio->replicas_failed) {
- unsigned idx = __fls(wbio->replicas_failed);
+ bkey_copy(&tmp.k, &b->key);
- bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
- wbio->replicas_failed ^= 1 << idx;
- }
+ new_key = bkey_i_to_extent(&tmp.k);
+ e = extent_i_to_s(new_key);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ bch2_extent_drop_ptr(e, ptr);
- if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
- bch2_btree_node_update_key(c, b, new_key)) {
- set_btree_node_noevict(b);
- bch2_fatal_error(c);
- }
+ if (!bch2_extent_nr_ptrs(e.c))
+ goto err;
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR)
+ goto retry;
+ if (ret)
+ goto err;
out:
+ bch2_btree_iter_unlock(&iter);
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
closure_put(cl);
+ return;
+err:
+ set_btree_node_noevict(b);
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
+ goto out;
}
void bch2_btree_write_error_work(struct work_struct *work)
struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
+ unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
- bch2_meta_write_fault("btree"))
- set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
+ if (bio->bi_status == BLK_STS_REMOVED ||
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+ bch2_meta_write_fault("btree")) {
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
wbio->used_mempool,
wbio->data);
- if (wbio->replicas_failed) {
- unsigned long flags;
-
+ if (wbio->failed.nr) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
queue_work(c->wq, &c->btree_write_error_work);
return;
}
wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
wbio->cl = parent;
+ wbio->failed.nr = 0;
wbio->order = order;
wbio->used_mempool = used_mempool;
wbio->data = data;
{
struct btree_iter *linked;
struct btree *b = iter->nodes[level];
- enum btree_node_locked_type want = btree_lock_want(iter, level);
- enum btree_node_locked_type have = btree_node_locked_type(iter, level);
+ int want = btree_lock_want(iter, level);
+ int have = btree_node_locked_type(iter, level);
if (want == have)
return true;
return true;
}
+bool bch2_btree_iter_relock(struct btree_iter *iter)
+{
+ unsigned l;
+
+ for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
+ if (!bch2_btree_node_relock(iter, l))
+ return false;
+
+ return true;
+}
+
/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
unsigned new_locks_want)
{
struct btree_iter *linked;
- unsigned l;
/* Drop locks we don't want anymore: */
if (new_locks_want < iter->locks_want)
iter->locks_want = new_locks_want;
btree_iter_drop_extra_locks(iter);
- for (l = iter->level; l < iter->locks_want && iter->nodes[l]; l++)
- if (!bch2_btree_node_relock(iter, l))
- goto fail;
+ if (bch2_btree_iter_relock(iter))
+ return true;
- return true;
-fail:
/*
* Just an optimization: ancestor nodes must be locked before child
* nodes, so set locks_want on iterators that might lock ancestors
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
-static inline int btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
}
bool bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
+ BTREE_NODE_dying,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
+BTREE_FLAG(dying);
static inline struct btree_write *btree_current_write(struct btree *b)
{
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
-int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
- struct bkey_i_extent *);
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
+ struct btree *, struct bkey_i_extent *);
#endif /* _BCACHEFS_BTREE_UPDATE_H */
static void btree_node_will_make_reachable(struct btree_update *,
struct btree *);
static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *);
+static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
/* Debug code: */
BUG_ON(c->btree_roots[b->btree_id].as != as);
c->btree_roots[b->btree_id].as = NULL;
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
/*
* We don't have to wait anything anything here (before
struct btree_write *w;
struct bset_tree *t;
+ set_btree_node_dying(b);
btree_interior_update_add_node_reference(as, b);
/*
* in with keys that aren't in the journal anymore:
*/
for_each_bset(b, t)
- as->journal_seq = max(as->journal_seq, bset(b, t)->journal_seq);
+ as->journal_seq = max(as->journal_seq,
+ le64_to_cpu(bset(b, t)->journal_seq));
mutex_lock(&c->btree_interior_update_lock);
mutex_unlock(&c->btree_cache.lock);
mutex_lock(&c->btree_root_lock);
+ BUG_ON(btree_node_root(c, b) &&
+ (b->level < btree_node_root(c, b)->level ||
+ !btree_node_dying(btree_node_root(c, b))));
+
btree_node_root(c, b) = b;
mutex_unlock(&c->btree_root_lock);
gc_pos_btree_root(b->btree_id));
}
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b)
+static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
{
struct btree_root *r = &c->btree_roots[b->btree_id];
bkey_copy(&r->key, &b->key);
r->level = b->level;
r->alive = true;
+ if (rw == WRITE)
+ c->btree_roots_dirty = true;
mutex_unlock(&c->btree_root_lock);
}
return ret;
}
-int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b,
- struct bkey_i_extent *new_key)
+static void __bch2_btree_node_update_key(struct bch_fs *c,
+ struct btree_update *as,
+ struct btree_iter *iter,
+ struct btree *b, struct btree *new_hash,
+ struct bkey_i_extent *new_key)
{
- struct btree_update *as = NULL;
- struct btree *parent, *new_hash = NULL;
- struct btree_iter iter;
- struct closure cl;
+ struct btree *parent;
bool must_rewrite_parent = false;
int ret;
- __bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
- BTREE_MAX_DEPTH,
- b->level, 0);
- closure_init_stack(&cl);
-
- ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
- if (ret)
- return ret;
-
-retry:
- down_read(&c->gc_lock);
- ret = bch2_btree_iter_traverse(&iter);
- if (ret)
- goto err;
-
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (!new_hash &&
- PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
- /* bch2_btree_reserve_get will unlock */
- do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
- closure_sync(&cl);
- } while (ret == -EAGAIN);
-
- BUG_ON(ret);
-
- new_hash = bch2_btree_node_mem_alloc(c);
- }
-
- as = bch2_btree_update_start(c, iter.btree_id,
- btree_update_reserve_required(c, b),
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_USE_RESERVE|
- BTREE_INSERT_USE_ALLOC_RESERVE,
- &cl);
- if (IS_ERR(as)) {
- ret = PTR_ERR(as);
- if (ret == -EAGAIN || ret == -EINTR) {
- bch2_btree_iter_unlock(&iter);
- up_read(&c->gc_lock);
- closure_sync(&cl);
- goto retry;
- }
- goto err;
- }
-
- mutex_lock(&c->btree_interior_update_lock);
-
/*
* Two corner cases that need to be thought about here:
*
if (b->will_make_reachable)
must_rewrite_parent = true;
- /* other case: btree node being freed */
- if (iter.nodes[b->level] != b) {
- /* node has been freed: */
- BUG_ON(btree_node_hashed(b));
- mutex_unlock(&c->btree_interior_update_lock);
- goto err;
- }
-
- mutex_unlock(&c->btree_interior_update_lock);
-
if (must_rewrite_parent)
as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE;
btree_interior_update_add_node_reference(as, b);
- parent = iter.nodes[b->level + 1];
+ parent = iter->nodes[b->level + 1];
if (parent) {
if (new_hash) {
bkey_copy(&new_hash->key, &new_key->k_i);
BUG_ON(ret);
}
- bch2_btree_insert_node(as, parent, &iter,
- &keylist_single(&new_key->k_i));
+ bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
BUG_ON(btree_node_root(c, b) != b);
- bch2_btree_node_lock_write(b, &iter);
+ bch2_btree_node_lock_write(b, iter);
bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i),
c->opts.btree_node_size, true,
&stats);
bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
gc_pos_btree_root(b->btree_id));
- bkey_copy(&b->key, &new_key->k_i);
+
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new_key->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ } else {
+ bkey_copy(&b->key, &new_key->k_i);
+ }
btree_update_updated_root(as);
- bch2_btree_node_unlock_write(b, &iter);
+ bch2_btree_node_unlock_write(b, iter);
}
bch2_btree_update_done(as);
-out:
+}
+
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
+ struct btree *b, struct bkey_i_extent *new_key)
+{
+ struct btree_update *as = NULL;
+ struct btree *new_hash = NULL;
+ struct closure cl;
+ int ret;
+
+ closure_init_stack(&cl);
+
+ if (!down_read_trylock(&c->gc_lock)) {
+ bch2_btree_iter_unlock(iter);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter)) {
+ ret = -EINTR;
+ goto err;
+ }
+ }
+
+ /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
+ if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /* bch2_btree_reserve_get will unlock */
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ if (ret) {
+ ret = -EINTR;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ new_hash = bch2_btree_node_mem_alloc(c);
+ }
+
+ as = bch2_btree_update_start(c, iter->btree_id,
+ btree_update_reserve_required(c, b),
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE,
+ &cl);
+ if (IS_ERR(as)) {
+ ret = PTR_ERR(as);
+ if (ret == -EAGAIN)
+ ret = -EINTR;
+
+ if (ret != -EINTR)
+ goto err;
+
+ bch2_btree_iter_unlock(iter);
+ up_read(&c->gc_lock);
+ closure_sync(&cl);
+ down_read(&c->gc_lock);
+
+ if (!bch2_btree_iter_relock(iter))
+ goto err;
+ }
+
+ ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE);
+ if (ret)
+ goto err_free_update;
+
+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+err:
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
list_move(&new_hash->list, &c->btree_cache.freeable);
six_unlock_write(&new_hash->lock);
six_unlock_intent(&new_hash->lock);
}
- bch2_btree_iter_unlock(&iter);
up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
-err:
- if (as)
- bch2_btree_update_free(as);
- goto out;
+err_free_update:
+ bch2_btree_update_free(as);
+ goto err;
}
/* Init code: */
BUG_ON(btree_node_root(c, b));
__bch2_btree_set_root_inmem(c, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, READ);
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
BUG_ON(btree_node_root(c, b));
bch2_btree_set_root_inmem(as, b);
- bch2_btree_set_root_ondisk(c, b);
+ bch2_btree_set_root_ondisk(c, b, WRITE);
bch2_btree_open_bucket_put(c, b);
six_unlock_intent(&b->lock);
#define bch2_usage_read_raw(_stats) \
({ \
- typeof(*this_cpu_ptr(_stats)) _acc = { 0 }; \
+ typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
+ memset(&_acc, 0, sizeof(_acc)); \
+ \
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
{
struct bucket_mark old, new;
unsigned saturated;
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
-
/* _uncompressed_ sectors: */
+ u64 online_reserved;
+ u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
-
- u64 online_reserved;
- u64 available_cache;
};
/*
#include "bcachefs.h"
#include "bcachefs_ioctl.h"
+#include "chardev.h"
#include "super.h"
#include "super-io.h"
return ERR_PTR(-EINVAL);
rcu_read_lock();
- ca = c->devs[dev];
+ ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
- if (copy_from_user(user_devs, arg.devs,
+ if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;
}
}
-static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c)
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+ unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
- return bch2_csum_opt_to_type(c->opts.data_checksum, true);
+ return bch2_csum_opt_to_type(opt, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
return nonce;
}
+static inline struct nonce null_nonce(void)
+{
+ struct nonce ret;
+
+ memset(&ret, 0, sizeof(ret));
+ return ret;
+}
+
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
+ if (c->opts.fix_errors == FSCK_OPT_EXIT) {
+ bch_err(c, "%s, exiting", buf);
+ mutex_unlock(&c->fsck_error_lock);
+ return FSCK_ERR_EXIT;
+ }
+
if (flags & FSCK_CAN_FIX) {
- if (c->opts.fix_errors == FSCK_ERR_ASK) {
+ if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
- } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+ } else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)
};
enum fsck_err_opts {
- FSCK_ERR_NO,
- FSCK_ERR_YES,
- FSCK_ERR_ASK,
+ FSCK_OPT_EXIT,
+ FSCK_OPT_YES,
+ FSCK_OPT_NO,
+ FSCK_OPT_ASK,
};
enum fsck_err_ret {
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
- (bio)->bi_error = -EIO; \
+ (bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */
#include "extents.h"
#include "inode.h"
#include "journal.h"
+#include "super.h"
#include "super-io.h"
#include "util.h"
#include "xattr.h"
return nr_ptrs;
}
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *c, struct bkey_s_c_extent e)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr_ptrs = 0;
+
+ extent_for_each_ptr(e, ptr)
+ nr_ptrs += (!ptr->cached &&
+ bch_dev_bkey_exists(c, ptr->dev)->mi.state !=
+ BCH_MEMBER_STATE_FAILED);
+
+ return nr_ptrs;
+}
+
unsigned bch2_extent_is_compressed(struct bkey_s_c k)
{
struct bkey_s_c_extent e;
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr)
{
- return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+ return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
}
static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
break;
case BCH_EXTENT_ENTRY_crc128:
- entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
- entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+ entry->crc128.csum.hi = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.hi);
+ entry->crc128.csum.lo = (__force __le64)
+ swab64((__force u64) entry->crc128.csum.lo);
break;
case BCH_EXTENT_ENTRY_ptr:
break;
const struct bch_extent_ptr *ptr2;
struct bch_dev *ca;
- if (ptr->dev >= c->sb.nr_devices)
+ if (ptr->dev >= c->sb.nr_devices ||
+ !c->devs[ptr->dev])
return "pointer to invalid device";
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!ca)
return "pointer to invalid device";
break;
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
- ca = c->devs[ptr->dev];
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+ ? bch_dev_bkey_exists(c, ptr->dev)
+ : NULL;
p("ptr: %u:%llu gen %u%s", ptr->dev,
(u64) ptr->offset, ptr->gen,
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc) {
- struct bch_dev *ca = c->devs[ptr->dev];
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
bool bad;
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
extent_for_each_ptr(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
g = PTR_BUCKET(ca, ptr);
replicas++;
ptrs_per_tier[ca->mi.tier]++;
static unsigned PTR_TIER(struct bch_fs *c,
const struct bch_extent_ptr *ptr)
{
- return c->devs[ptr->dev]->mi.tier;
+ return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
}
static void bch2_extent_crc_init(union bch_extent_crc *crc,
struct bkey_s_extent e)
{
struct bch_extent_ptr *ptr;
- unsigned tier = 0, nr_cached = 0, nr_good = 0;
+ unsigned tier = 0, nr_cached = 0;
+ unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached &&
- c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED)
- nr_good++;
-
if (nr_good <= c->opts.data_replicas)
return;
return BCH_MERGE_NOMERGE;
/* We don't allow extents to straddle buckets: */
- ca = c->devs[lp->dev];
+ ca = bch_dev_bkey_exists(c, lp->dev);
if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
return BCH_MERGE_NOMERGE;
}
}
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+ struct btree_iter iter;
+ struct bpos end = pos;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ end.offset += size;
+
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+ BTREE_ITER_WITH_HOLES, k) {
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+ break;
+
+ if (!bch2_extent_is_fully_allocated(k)) {
+ ret = -ENOSPC;
+ break;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ return ret;
+}
+
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
+unsigned bch2_extent_nr_good_ptrs(struct bch_fs *, struct bkey_s_c_extent);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
case BCH_EXTENT_CRC32:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
- .csum.lo = crc->crc32.csum,
+ .csum.lo = (__force __le64) crc->crc32.csum,
};
case BCH_EXTENT_CRC64:
return (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
- .csum.lo = crc->crc64.csum_lo,
- .csum.hi = crc->crc64.csum_hi,
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
+ .csum.hi = (__force __le64) crc->crc64.csum_hi,
};
case BCH_EXTENT_CRC128:
return (struct bch_extent_crc_unpacked) {
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
+int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
struct i_sectors_hook {
struct extent_insert_hook hook;
- s64 sectors;
struct bch_inode_info *inode;
+ s64 sectors;
+ u64 new_i_size;
+ unsigned flags;
+ unsigned appending:1;
};
struct bchfs_write_op {
struct bch_write_op op;
};
-static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
- struct bch_inode_info *inode,
- bool is_dio)
-{
- op->inode = inode;
- op->sectors_added = 0;
- op->is_dio = is_dio;
- op->unalloc = false;
- op->new_i_size = U64_MAX;
-}
-
struct bch_writepage_io {
struct closure cl;
struct closure cl;
struct kiocb *req;
struct bch_fs *c;
- long written;
- long error;
loff_t offset;
- struct disk_reservation res;
-
struct iovec *iovec;
struct iovec inline_vecs[UIO_FASTIOV];
struct iov_iter iter;
lockdep_assert_held(&inode->ei_update_lock);
bi->bi_size = *new_i_size;
-
- if (atomic_long_read(&inode->ei_size_dirty_count))
- bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
- else
- bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
return 0;
}
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
-static inline void i_size_dirty_put(struct bch_inode_info *inode)
+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- atomic_long_dec_bug(&inode->ei_size_dirty_count);
+ inode->v.i_blocks += sectors;
}
-static inline void i_size_dirty_get(struct bch_inode_info *inode)
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, int sectors)
{
- lockdep_assert_held(&inode->v.i_rwsem);
-
- atomic_long_inc(&inode->ei_size_dirty_count);
+ mutex_lock(&inode->ei_update_lock);
+ __i_sectors_acct(c, inode, sectors);
+ mutex_unlock(&inode->ei_update_lock);
}
/* i_sectors accounting: */
int sign = bkey_extent_is_allocation(&insert->k) -
(k.k && bkey_extent_is_allocation(k.k));
- EBUG_ON(!(h->inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY));
- EBUG_ON(!atomic_long_read(&h->inode->ei_sectors_dirty_count));
+ EBUG_ON(!(h->inode->ei_inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY));
h->sectors += sectors * sign;
return BTREE_INSERT_OK;
}
-static int inode_set_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi, void *p)
-{
- BUG_ON(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY);
-
- bi->bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
- return 0;
-}
-
-static int inode_clear_i_sectors_dirty(struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
+static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
{
- BUG_ON(!(bi->bi_flags & BCH_INODE_I_SECTORS_DIRTY));
+ struct i_sectors_hook *h = p;
- bi->bi_sectors = atomic64_read(&inode->ei_sectors);
- bi->bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+ if (h->new_i_size != U64_MAX &&
+ (!h->appending ||
+ h->new_i_size > bi->bi_size))
+ bi->bi_size = h->new_i_size;
+ bi->bi_sectors += h->sectors;
+ bi->bi_flags &= ~h->flags;
return 0;
}
-static void i_sectors_dirty_put(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
{
- if (h->sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += h->sectors;
- spin_unlock(&inode->v.i_lock);
+ int ret;
- atomic64_add(h->sectors, &inode->ei_sectors);
- EBUG_ON(atomic64_read(&inode->ei_sectors) < 0);
- }
+ mutex_lock(&h->inode->ei_update_lock);
+ if (h->new_i_size != U64_MAX)
+ i_size_write(&h->inode->v, h->new_i_size);
- EBUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count) <= 0);
+ __i_sectors_acct(c, h->inode, h->sectors);
- mutex_lock(&inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
- if (atomic_long_dec_and_test(&inode->ei_sectors_dirty_count)) {
- int ret = __bch2_write_inode(c, inode,
- inode_clear_i_sectors_dirty, NULL);
+ h->sectors = 0;
- ret = ret;
- }
-
- mutex_unlock(&inode->ei_update_lock);
+ return ret;
}
-static int __must_check i_sectors_dirty_get(struct bch_fs *c,
- struct bch_inode_info *inode,
- struct i_sectors_hook *h)
+static int i_sectors_dirty_start_fn(struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi, void *p)
{
- int ret = 0;
+ struct i_sectors_hook *h = p;
- h->hook.fn = i_sectors_hook_fn;
- h->sectors = 0;
-#ifdef CONFIG_BCACHEFS_DEBUG
- h->inode = inode;
-#endif
+ if (h->flags & BCH_INODE_I_SIZE_DIRTY)
+ bi->bi_size = h->new_i_size;
- if (atomic_long_inc_not_zero(&inode->ei_sectors_dirty_count))
- return 0;
-
- mutex_lock(&inode->ei_update_lock);
-
- if (!(inode->ei_flags & BCH_INODE_I_SECTORS_DIRTY))
- ret = __bch2_write_inode(c, inode, inode_set_i_sectors_dirty,
- NULL);
+ bi->bi_flags |= h->flags;
+ return 0;
+}
- if (!ret)
- atomic_long_inc(&inode->ei_sectors_dirty_count);
+static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h)
+{
+ int ret;
- mutex_unlock(&inode->ei_update_lock);
+ mutex_lock(&h->inode->ei_update_lock);
+ ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h);
+ mutex_unlock(&h->inode->ei_update_lock);
return ret;
}
+static inline struct i_sectors_hook
+i_sectors_hook_init(struct bch_inode_info *inode, unsigned flags)
+{
+ return (struct i_sectors_hook) {
+ .hook.fn = i_sectors_hook_fn,
+ .inode = inode,
+ .sectors = 0,
+ .new_i_size = U64_MAX,
+ .flags = flags|BCH_INODE_I_SECTORS_DIRTY,
+ };
+}
+
+/* normal i_size/i_sectors update machinery: */
+
struct bchfs_extent_trans_hook {
struct bchfs_write_op *op;
struct extent_insert_hook hook;
BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE));
/* XXX: inode->i_size locking */
- if (offset > inode->ei_size) {
- BUG_ON(inode->ei_flags & BCH_INODE_I_SIZE_DIRTY);
-
+ if (offset > inode->ei_inode.bi_size) {
if (!h->need_inode_update) {
h->need_inode_update = true;
return BTREE_INSERT_NEED_TRAVERSE;
}
+ BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+
h->inode_u.bi_size = offset;
do_pack = true;
- inode->ei_size = offset;
+ inode->ei_inode.bi_size = offset;
if (h->op->is_dio)
i_size_write(&inode->v, offset);
h->inode_u.bi_sectors += sectors;
do_pack = true;
- atomic64_add(sectors, &inode->ei_sectors);
-
h->op->sectors_added += sectors;
-
- if (h->op->is_dio) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += sectors;
- spin_unlock(&inode->v.i_lock);
- }
}
if (do_pack)
struct btree_iter extent_iter, inode_iter;
struct bchfs_extent_trans_hook hook;
struct bkey_i *k = bch2_keylist_front(keys);
+ s64 orig_sectors_added = op->sectors_added;
int ret;
BUG_ON(k->k.p.inode != op->inode->v.i_ino);
/* XXX: inode->i_size locking */
k = bch2_keylist_front(keys);
- if (min(k->k.p.offset << 9, op->new_i_size) > op->inode->ei_size)
+ if (min(k->k.p.offset << 9, op->new_i_size) >
+ op->inode->ei_inode.bi_size)
hook.need_inode_update = true;
if (hook.need_inode_update) {
bch2_btree_iter_unlock(&extent_iter);
bch2_btree_iter_unlock(&inode_iter);
+ if (op->is_dio)
+ i_sectors_acct(wop->c, op->inode,
+ op->sectors_added - orig_sectors_added);
+
return ret;
}
+static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
+ struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct bch_io_opts opts,
+ bool is_dio)
+{
+ op->inode = inode;
+ op->sectors_added = 0;
+ op->is_dio = is_dio;
+ op->unalloc = false;
+ op->new_i_size = U64_MAX;
+
+ bch2_write_op_init(&op->op, c);
+ op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+ op->op.devs = c->fastest_devs;
+ op->op.index_update_fn = bchfs_write_index_update;
+ op_journal_seq_set(&op->op, &inode->ei_journal_seq);
+}
+
+static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode->ei_inode));
+ return opts;
+}
+
/* page state: */
/* stored in page->private: */
s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
ClearPagePrivate(page);
- if (s.dirty_sectors) {
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks -= s.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (s.dirty_sectors)
+ i_sectors_acct(c, inode, -s.dirty_sectors);
if (s.reserved)
bch2_disk_reservation_put(c, &res);
int bch2_set_page_dirty(struct page *page)
{
+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_page_state old, new;
old = page_state_cmpxchg(page_state(page), new,
new.dirty_sectors = PAGE_SECTORS - new.sectors;
);
- if (old.dirty_sectors != new.dirty_sectors) {
- struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += new.dirty_sectors - old.dirty_sectors;
- spin_unlock(&inode->v.i_lock);
- }
+ if (old.dirty_sectors != new.dirty_sectors)
+ i_sectors_acct(c, inode, new.dirty_sectors - old.dirty_sectors);
return __set_page_dirty_nobuffers(page);
}
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- if (!bio->bi_error) {
+ if (!bio->bi_status) {
SetPageUptodate(page);
} else {
ClearPageUptodate(page);
{
struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct btree_iter iter;
struct page *page;
struct readpages_iter readpages_iter = {
c->sb.encoded_extent_max >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio =
- to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+ opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
bio_add_page_contig(&rbio->bio, page);
{
struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_io_opts opts = io_opts(c, inode);
struct bch_read_bio *rbio;
- rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
struct bch_writepage_state {
struct bch_writepage_io *io;
+ struct bch_io_opts opts;
};
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+ struct bch_inode_info *inode)
+{
+ return (struct bch_writepage_state) { .opts = io_opts(c, inode) };
+}
+
static void bch2_writepage_io_free(struct closure *cl)
{
struct bch_writepage_io *io = container_of(cl,
* PageWriteback is effectively our ref on the inode - fixup i_blocks
* before calling end_page_writeback:
*/
- if (io->op.sectors_added) {
- struct bch_inode_info *inode = io->op.inode;
-
- spin_lock(&inode->v.i_lock);
- inode->v.i_blocks += io->op.sectors_added;
- spin_unlock(&inode->v.i_lock);
- }
+ if (io->op.sectors_added)
+ i_sectors_acct(c, io->op.inode, io->op.sectors_added);
bio_for_each_segment_all(bvec, bio, i)
end_page_writeback(bvec->bv_page);
w->io = NULL;
atomic_add(bio->bi_vcnt, &io->op.op.c->writeback_pages);
- io->op.op.pos.offset = bio->bi_iter.bi_sector;
-
closure_call(&io->op.op.cl, bch2_write, NULL, &io->cl);
continue_at(&io->cl, bch2_writepage_io_done, NULL);
}
static void bch2_writepage_io_alloc(struct bch_fs *c,
struct bch_writepage_state *w,
struct bch_inode_info *inode,
- struct page *page)
-{
- u64 inum = inode->v.i_ino;
- unsigned nr_replicas = page_state(page)->nr_replicas;
-
- EBUG_ON(!nr_replicas);
- /* XXX: disk_reservation->gen isn't plumbed through */
-
- if (!w->io) {
-alloc_io:
- w->io = container_of(bio_alloc_bioset(GFP_NOFS,
- BIO_MAX_PAGES,
- &c->writepage_bioset),
- struct bch_writepage_io, op.op.wbio.bio);
-
- closure_init(&w->io->cl, NULL);
- bch2_fswrite_op_init(&w->io->op, inode, false);
- bch2_write_op_init(&w->io->op.op, c,
- (struct disk_reservation) {
- .nr_replicas = c->opts.data_replicas,
- },
- c->fastest_devs,
- writepoint_hashed(inode->ei_last_dirtied),
- POS(inum, 0),
- &inode->ei_journal_seq,
- 0);
- w->io->op.op.index_update_fn = bchfs_write_index_update;
- }
+ struct page *page,
+ struct bch_page_state s)
+{
+ struct bch_write_op *op;
+ u64 offset = (u64) page->index << PAGE_SECTOR_SHIFT;
- if (w->io->op.op.res.nr_replicas != nr_replicas ||
- bio_add_page_contig(&w->io->op.op.wbio.bio, page)) {
- bch2_writepage_do_io(w);
- goto alloc_io;
- }
+ w->io = container_of(bio_alloc_bioset(GFP_NOFS,
+ BIO_MAX_PAGES,
+ &c->writepage_bioset),
+ struct bch_writepage_io, op.op.wbio.bio);
+ op = &w->io->op.op;
- /*
- * We shouldn't ever be handed pages for multiple inodes in a single
- * pass - right?
- */
- BUG_ON(inode != w->io->op.inode);
+ closure_init(&w->io->cl, NULL);
+
+ bch2_fswrite_op_init(&w->io->op, c, inode, w->opts, false);
+ op->nr_replicas = s.nr_replicas;
+ op->res.nr_replicas = s.nr_replicas;
+ op->write_point = writepoint_hashed(inode->ei_last_dirtied);
+ op->pos = POS(inode->v.i_ino, offset);
+ op->wbio.bio.bi_iter.bi_sector = offset;
}
static int __bch2_writepage(struct bch_fs *c, struct page *page,
*/
zero_user_segment(page, offset, PAGE_SIZE);
do_io:
- bch2_writepage_io_alloc(c, w, inode, page);
-
- /* while page is locked: */
- w->io->op.new_i_size = i_size;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
-
/* Before unlocking the page, transfer reservation to w->io: */
old = page_state_cmpxchg(page_state(page), new, {
EBUG_ON(!new.reserved &&
(new.sectors != PAGE_SECTORS ||
!new.allocated));
- if (new.allocated &&
- w->io->op.op.compression_type != BCH_COMPRESSION_NONE)
+ if (new.allocated && w->opts.compression)
new.allocated = 0;
else if (!new.reserved)
- goto out;
+ break;
new.reserved = 0;
});
- w->io->op.op.res.sectors += PAGE_SECTORS *
- (old.reserved - new.reserved) *
- old.nr_replicas;
-out:
+ if (w->io &&
+ (w->io->op.op.res.nr_replicas != old.nr_replicas ||
+ !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
+ bch2_writepage_do_io(w);
+
+ if (!w->io)
+ bch2_writepage_io_alloc(c, w, inode, page, old);
+
+ BUG_ON(inode != w->io->op.inode);
+ BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
+
+ if (old.reserved)
+ w->io->op.op.res.sectors += old.nr_replicas * PAGE_SECTORS;
+
+ /* while page is locked: */
+ w->io->op.new_i_size = i_size;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ w->io->op.op.wbio.bio.bi_opf |= REQ_SYNC;
+
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(mapping->host));
struct pagecache_iter iter;
struct page *page;
int ret = 0;
int bch2_writepage(struct page *page, struct writeback_control *wbc)
{
struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w = { NULL };
+ struct bch_writepage_state w =
+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host));
int ret;
ret = __bch2_writepage(c, page, wbc, &w);
__bchfs_readpage(c, rbio, inode->v.i_ino, page);
wait_for_completion(&done);
- ret = rbio->bio.bi_error;
+ ret = blk_status_to_errno(rbio->bio.bi_status);
bio_put(&rbio->bio);
if (ret < 0)
{
struct dio_read *dio = bio->bi_private;
- if (bio->bi_error)
- dio->ret = bio->bi_error;
+ if (bio->bi_status)
+ dio->ret = blk_status_to_errno(bio->bi_status);
closure_put(&dio->cl);
}
struct file *file, struct bch_inode_info *inode,
struct iov_iter *iter, loff_t offset)
{
+ struct bch_io_opts opts = io_opts(c, inode);
struct dio_read *dio;
struct bio *bio;
bool sync = is_sync_kiocb(req);
ret = bio_iov_iter_get_pages(bio, iter);
if (ret < 0) {
/* XXX: fault inject this path */
- bio->bi_error = ret;
+ bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
break;
}
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, to_rbio(bio), inode->v.i_ino);
+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
}
if (sync) {
struct file *file = dio->req->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
- long ret = dio->error ?: dio->written;
+ long ret = dio->iop.op.error ?: ((long) dio->iop.op.written << 9);
- bch2_disk_reservation_put(dio->c, &dio->res);
+ bch2_disk_reservation_put(dio->c, &dio->iop.op.res);
__pagecache_block_put(&mapping->add_lock);
inode_dio_end(&inode->v);
struct bio_vec *bv;
int i;
- dio->written += dio->iop.op.written << 9;
-
- if (dio->iop.op.error)
- dio->error = dio->iop.op.error;
-
bio_for_each_segment_all(bv, &dio->iop.op.wbio.bio, i)
put_page(bv->bv_page);
struct file *file = dio->req->ki_filp;
struct bch_inode_info *inode = file_bch_inode(file);
struct bio *bio = &dio->iop.op.wbio.bio;
- unsigned flags = 0;
int ret;
- if ((dio->req->ki_flags & IOCB_DSYNC) &&
- !dio->c->opts.journal_flush_disabled)
- flags |= BCH_WRITE_FLUSH;
-
ret = bio_iov_iter_get_pages(bio, &dio->iter);
if (ret < 0) {
- /*
- * these didn't get initialized, but bch2_dio_write_done() will
- * look at them:
- */
- dio->iop.op.error = 0;
- dio->iop.op.written = 0;
- dio->error = ret;
+ dio->iop.op.error = ret;
return;
}
- dio->iop.sectors_added = 0;
- bch2_write_op_init(&dio->iop.op, dio->c, dio->res,
- dio->c->fastest_devs,
- writepoint_hashed((unsigned long) dio->task),
- POS(inode->v.i_ino, (dio->offset + dio->written) >> 9),
- &inode->ei_journal_seq,
- flags);
- dio->iop.op.index_update_fn = bchfs_write_index_update;
-
- if (!dio->iop.unalloc) {
- dio->res.sectors -= bio_sectors(bio);
- dio->iop.op.res.sectors = bio_sectors(bio);
- }
+ dio->iop.op.pos = POS(inode->v.i_ino, (dio->offset >> 9) + dio->iop.op.written);
task_io_account_write(bio->bi_iter.bi_size);
bch2_dio_write_done(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
use_mm(dio->task->mm);
pagecache_block_get(&mapping->add_lock);
}
}
-static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos,
- u64 size)
-{
- struct btree_iter iter;
- struct bpos end = pos;
- struct bkey_s_c k;
- int ret = 0;
-
- end.offset += size;
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
- BTREE_ITER_WITH_HOLES, k) {
- if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
- break;
-
- if (!bch2_extent_is_fully_allocated(k)) {
- ret = -ENOSPC;
- break;
- }
- }
- bch2_btree_iter_unlock(&iter);
-
- return ret;
-}
-
static int bch2_direct_IO_write(struct bch_fs *c,
struct kiocb *req, struct file *file,
struct bch_inode_info *inode,
closure_init(&dio->cl, NULL);
dio->req = req;
dio->c = c;
- dio->written = 0;
- dio->error = 0;
dio->offset = offset;
dio->iovec = NULL;
dio->iter = *iter;
dio->task = current;
- bch2_fswrite_op_init(&dio->iop, inode, true);
+ bch2_fswrite_op_init(&dio->iop, c, inode, io_opts(c, inode), true);
+ dio->iop.op.write_point = writepoint_hashed((unsigned long) dio->task);
+ dio->iop.op.flags |= BCH_WRITE_NOPUT_RESERVATION;
+
+ if ((dio->req->ki_flags & IOCB_DSYNC) &&
+ !c->opts.journal_flush_disabled)
+ dio->iop.op.flags |= BCH_WRITE_FLUSH;
if (offset + iter->count > inode->v.i_size)
sync = true;
* Have to then guard against racing with truncate (deleting data that
* we would have been overwriting)
*/
- ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0);
+ ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9, 0);
if (unlikely(ret)) {
if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
offset >> 9),
dio->iop.unalloc = true;
}
+ dio->iop.op.nr_replicas = dio->iop.op.res.nr_replicas;
+
inode_dio_begin(&inode->v);
__pagecache_block_get(&mapping->add_lock);
closure_sync(&dio->cl);
bch2_dio_write_done(dio);
- } while (dio->iter.count && !dio->error);
+ } while (dio->iter.count && !dio->iop.op.error);
closure_debug_destroy(&dio->cl);
return __bch2_dio_write_complete(dio);
} else {
bch2_do_direct_IO_write(dio);
- if (dio->iter.count && !dio->error) {
+ if (dio->iter.count && !dio->iop.op.error) {
if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
dio->iovec = kmalloc(dio->iter.nr_segs *
sizeof(struct iovec),
GFP_KERNEL);
if (!dio->iovec)
- dio->error = -ENOMEM;
+ dio->iop.op.error = -ENOMEM;
} else {
dio->iovec = dio->inline_vecs;
}
return bch2_journal_flush_seq(&c->journal, inode->ei_journal_seq);
}
-static int __bch2_truncate_page(struct address_space *mapping,
+static int __bch2_truncate_page(struct bch_inode_info *inode,
pgoff_t index, loff_t start, loff_t end)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct address_space *mapping = inode->v.i_mapping;
unsigned start_offset = start & (PAGE_SIZE - 1);
unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
struct page *page;
return ret;
}
-static int bch2_truncate_page(struct address_space *mapping, loff_t from)
+/*
+ * Truncate the single page containing @from: delegates to
+ * __bch2_truncate_page() for the page at index from >> PAGE_SHIFT,
+ * over the range [@from, @from + PAGE_SIZE).
+ */
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
{
- return __bch2_truncate_page(mapping, from >> PAGE_SHIFT,
- from, from + PAGE_SIZE);
+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
+ from, from + PAGE_SIZE);
}
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct address_space *mapping = inode->v.i_mapping;
bool shrink = iattr->ia_size <= inode->v.i_size;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
int ret = 0;
inode_dio_wait(&inode->v);
/* sync appends.. */
/* XXX what protects inode->i_size? */
- if (iattr->ia_size > inode->ei_size)
+ if (iattr->ia_size > inode->ei_inode.bi_size)
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err_put_pagecache;
- mutex_lock(&inode->ei_update_lock);
- i_size_dirty_get(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ i_sectors_hook.new_i_size = iattr->ia_size;
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
* here (new i_size < current i_size):
*/
if (shrink) {
- struct i_sectors_hook i_sectors_hook;
- int ret;
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = bch2_truncate_page(inode, iattr->ia_size);
if (unlikely(ret))
goto err;
- ret = bch2_truncate_page(inode->v.i_mapping, iattr->ia_size);
- if (unlikely(ret)) {
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
- goto err;
- }
-
ret = bch2_inode_truncate(c, inode->v.i_ino,
- round_up(iattr->ia_size, PAGE_SIZE) >> 9,
- &i_sectors_hook.hook,
- &inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-
+ round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+ &i_sectors_hook.hook,
+ &inode->ei_journal_seq);
if (unlikely(ret))
goto err;
}
- mutex_lock(&inode->ei_update_lock);
setattr_copy(&inode->v, iattr);
- inode->v.i_mtime = inode->v.i_ctime = current_fs_time(inode->v.i_sb);
-out:
- /* clear I_SIZE_DIRTY: */
- i_size_dirty_put(inode);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
+ inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+err:
+ /*
+ * On error - in particular, bch2_truncate_page() error - don't clear
+ * I_SIZE_DIRTY, as we've left data above i_size!:
+ */
+ if (ret)
+ i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err_put_pagecache:
pagecache_block_put(&mapping->add_lock);
return ret;
-err:
- mutex_lock(&inode->ei_update_lock);
- goto out;
}
static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
if (offset >> PAGE_SHIFT !=
(offset + len) >> PAGE_SHIFT) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
(offset + len) >> PAGE_SHIFT,
offset, offset + len);
if (unlikely(ret))
- goto out;
+ goto err;
}
truncate_pagecache_range(&inode->v, offset, offset + len - 1);
if (discard_start < discard_end) {
struct disk_reservation disk_res;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook =
+ i_sectors_hook_init(inode, 0);
int ret;
- BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
-
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
- goto out;
+ goto err;
+
+ /*
+ * We need to pass in a disk reservation here because we might
+ * be splitting a compressed extent into two. This isn't a
+ * problem with truncate because truncate will never split an
+ * extent, only truncate it...
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, 0, 0);
+ BUG_ON(ret);
ret = bch2_btree_delete_range(c,
BTREE_ID_EXTENTS,
&disk_res,
&i_sectors_hook.hook,
&inode->ei_journal_seq);
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
bch2_disk_reservation_put(c, &disk_res);
+
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
}
-out:
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
struct btree_iter dst;
BKEY_PADDED(k) copy;
struct bkey_s_c k;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
loff_t new_size;
int ret;
if (ret)
goto err;
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (ret)
goto err;
BTREE_INSERT_ENTRY(&dst, ©.k));
bch2_disk_reservation_put(c, &disk_res);
btree_iter_err:
- if (ret < 0 && ret != -EINTR)
- goto err_unwind;
+ if (ret == -EINTR)
+ ret = 0;
+ if (ret)
+ goto err_put_sectors_dirty;
+ /*
+ * XXX: if we error here we've left data with multiple
+ * pointers... which isn't a _super_ serious problem...
+ */
bch2_btree_iter_cond_resched(&src);
}
&i_sectors_hook.hook,
&inode->ei_journal_seq);
if (ret)
- goto err_unwind;
-
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ goto err_put_sectors_dirty;
- mutex_lock(&inode->ei_update_lock);
i_size_write(&inode->v, new_size);
- ret = bch2_write_inode_size(c, inode, inode->v.i_size);
- mutex_unlock(&inode->ei_update_lock);
-
+ i_sectors_hook.new_i_size = new_size;
+err_put_sectors_dirty:
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
+err:
pagecache_block_put(&mapping->add_lock);
inode_unlock(&inode->v);
- return ret;
-err_unwind:
- /*
- * XXX: we've left data with multiple pointers... which isn't a _super_
- * serious problem...
- */
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
-err:
bch2_btree_iter_unlock(&src);
bch2_btree_iter_unlock(&dst);
- pagecache_block_put(&mapping->add_lock);
- inode_unlock(&inode->v);
return ret;
}
{
struct address_space *mapping = inode->v.i_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct i_sectors_hook i_sectors_hook;
+ struct i_sectors_hook i_sectors_hook = i_sectors_hook_init(inode, 0);
struct btree_iter iter;
- struct bpos end;
+ struct bpos end_pos;
loff_t block_start, block_end;
- loff_t new_size = offset + len;
+ loff_t end = offset + len;
unsigned sectors;
unsigned replicas = READ_ONCE(c->opts.data_replicas);
int ret;
inode_dio_wait(&inode->v);
pagecache_block_get(&mapping->add_lock);
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- ret = inode_newsize_ok(&inode->v, new_size);
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
+ ret = inode_newsize_ok(&inode->v, end);
if (ret)
goto err;
}
if (mode & FALLOC_FL_ZERO_RANGE) {
- ret = __bch2_truncate_page(mapping,
+ ret = __bch2_truncate_page(inode,
offset >> PAGE_SHIFT,
- offset, offset + len);
+ offset, end);
if (!ret &&
- offset >> PAGE_SHIFT !=
- (offset + len) >> PAGE_SHIFT)
- ret = __bch2_truncate_page(mapping,
- (offset + len) >> PAGE_SHIFT,
- offset, offset + len);
+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
+ ret = __bch2_truncate_page(inode,
+ end >> PAGE_SHIFT,
+ offset, end);
if (unlikely(ret))
goto err;
- truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+ truncate_pagecache_range(&inode->v, offset, end - 1);
block_start = round_up(offset, PAGE_SIZE);
- block_end = round_down(offset + len, PAGE_SIZE);
+ block_end = round_down(end, PAGE_SIZE);
} else {
block_start = round_down(offset, PAGE_SIZE);
- block_end = round_up(offset + len, PAGE_SIZE);
+ block_end = round_up(end, PAGE_SIZE);
}
bch2_btree_iter_set_pos(&iter, POS(inode->v.i_ino, block_start >> 9));
- end = POS(inode->v.i_ino, block_end >> 9);
+ end_pos = POS(inode->v.i_ino, block_end >> 9);
- ret = i_sectors_dirty_get(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_start(c, &i_sectors_hook);
if (unlikely(ret))
goto err;
- while (bkey_cmp(iter.pos, end) < 0) {
+ while (bkey_cmp(iter.pos, end_pos) < 0) {
struct disk_reservation disk_res = { 0 };
struct bkey_i_reservation reservation;
struct bkey_s_c k;
reservation.k.size = k.k->size;
bch2_cut_front(iter.pos, &reservation.k_i);
- bch2_cut_back(end, &reservation.k);
+ bch2_cut_back(end_pos, &reservation.k);
sectors = reservation.k.size;
reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k);
}
bch2_btree_iter_unlock(&iter);
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > inode->v.i_size) {
- i_size_write(&inode->v, new_size);
+ end > inode->v.i_size) {
+ i_size_write(&inode->v, end);
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
/* blech */
if ((mode & FALLOC_FL_KEEP_SIZE) &&
(mode & FALLOC_FL_ZERO_RANGE) &&
- inode->ei_size != inode->v.i_size) {
+ inode->ei_inode.bi_size != inode->v.i_size) {
/* sync appends.. */
ret = filemap_write_and_wait_range(mapping,
- inode->ei_size, S64_MAX);
+ inode->ei_inode.bi_size, S64_MAX);
if (ret)
goto err;
- if (inode->ei_size != inode->v.i_size) {
+ if (inode->ei_inode.bi_size != inode->v.i_size) {
mutex_lock(&inode->ei_update_lock);
ret = bch2_write_inode_size(c, inode, inode->v.i_size);
mutex_unlock(&inode->ei_update_lock);
return 0;
err_put_sectors_dirty:
- i_sectors_dirty_put(c, inode, &i_sectors_hook);
+ ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
err:
bch2_btree_iter_unlock(&iter);
pagecache_block_put(&mapping->add_lock);
int bch2_fs_fsio_init(struct bch_fs *c)
{
if (bioset_init(&c->writepage_bioset,
- 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) ||
+ 4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_read_bioset,
- 4, offsetof(struct dio_read, rbio.bio)) ||
+ 4, offsetof(struct dio_read, rbio.bio),
+ BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
- 4, offsetof(struct dio_write, iop.op.wbio.bio)))
+ 4, offsetof(struct dio_write, iop.op.wbio.bio),
+ BIOSET_NEED_BVECS))
return -ENOMEM;
return 0;
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
- set_flags(bch_flags_to_vfs, inode->ei_flags, inode->v.i_flags);
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
static int bch2_inode_flags_set(struct bch_inode_info *inode,
return -EINVAL;
bi->bi_flags = newflags;
- inode->v.i_ctime = current_fs_time(inode->v.i_sb);
+ inode->v.i_ctime = current_time(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_flags);
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
{
struct fsxattr fa = { 0 };
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_flags);
+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
return copy_to_user(arg, &fa, sizeof(fa));
}
#include "fs-ioctl.h"
#include "fsck.h"
#include "inode.h"
+#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "super.h"
BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
} while (ret == -EINTR);
- if (!ret) {
- inode->ei_size = inode_u.bi_size;
- inode->ei_flags = inode_u.bi_flags;
- }
+ if (!ret)
+ inode->ei_inode = inode_u;
out:
bch2_btree_iter_unlock(&iter);
return __bch2_write_inode(c, inode, NULL, NULL);
}
-int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_inc_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret;
return ret;
}
-int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
+static int bch2_dec_nlink(struct bch_fs *c, struct bch_inode_info *inode)
{
int ret = 0;
bch2_inode_init(c, &inode_u,
i_uid_read(&inode->v),
i_gid_read(&inode->v),
- inode->v.i_mode, rdev);
+ inode->v.i_mode, rdev,
+ &dir->ei_inode);
+
ret = bch2_inode_create(c, &inode_u,
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (unlikely(ret))
return ret;
- dir->v.i_mtime = dir->v.i_ctime = current_fs_time(c->vfs_sb);
+ dir->v.i_mtime = dir->v.i_ctime = current_time(&dir->v);
mark_inode_dirty_sync(&dir->v);
return 0;
}
lockdep_assert_held(&inode->v.i_rwsem);
- inode->v.i_ctime = current_fs_time(dir->v.i_sb);
+ inode->v.i_ctime = current_time(&dir->v);
ret = bch2_inc_nlink(c, inode);
if (ret)
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
lockdep_assert_held(&old_dir->v.i_rwsem);
{
struct bch_inode_info *old_inode = to_bch_ei(old_dentry->d_inode);
struct bch_inode_info *new_inode = to_bch_ei(new_dentry->d_inode);
- struct timespec now = current_fs_time(old_dir->v.i_sb);
+ struct timespec now = current_time(&old_dir->v);
int ret;
ret = bch2_dirent_rename(c,
inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
inode->ei_journal_seq = 0;
- inode->ei_size = bi->bi_size;
- inode->ei_flags = bi->bi_flags;
- atomic64_set(&inode->ei_sectors, bi->bi_sectors);
inode->ei_str_hash = bch2_hash_info_init(c, bi);
+ inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
inode->ei_journal_seq = 0;
- atomic_long_set(&inode->ei_size_dirty_count, 0);
- atomic_long_set(&inode->ei_sectors_dirty_count, 0);
return &inode->v;
}
truncate_inode_pages_final(&inode->v.i_data);
- if (!bch2_journal_error(&c->journal) && !is_bad_inode(&inode->v)) {
- /* XXX - we want to check this stuff iff there weren't IO errors: */
- BUG_ON(atomic_long_read(&inode->ei_sectors_dirty_count));
- BUG_ON(atomic64_read(&inode->ei_sectors) != inode->v.i_blocks);
- }
-
clear_inode(&inode->v);
if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
sb->s_magic = BCACHEFS_STATFS_MAGIC;
sb->s_time_gran = c->sb.time_precision;
c->vfs_sb = sb;
- sb->s_bdi = &c->bdi;
strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
+ ret = super_setup_bdi(sb);
+ if (ret)
+ goto err_put_super;
+
+ sb->s_bdi->congested_fn = bch2_congested;
+ sb->s_bdi->congested_data = c;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+
for_each_online_member(ca, c, i) {
struct block_device *bdev = ca->disk_sb.bdev;
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
+#include "opts.h"
#include "str_hash.h"
#include <linux/seqlock.h>
struct mutex ei_update_lock;
u64 ei_journal_seq;
-
- atomic_long_t ei_size_dirty_count;
-
- /*
- * these are updated whenever we update the inode in the btree - for
- * e.g. fsync
- */
- u64 ei_size;
- u32 ei_flags;
-
- atomic_long_t ei_sectors_dirty_count;
- atomic64_t ei_sectors;
+ unsigned long ei_last_dirtied;
struct bch_hash_info ei_str_hash;
- unsigned long ei_last_dirtied;
+ /* copy of inode in btree: */
+ struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \
"hash table key at wrong offset: %llu, "
"hashed to %llu chain starts at %llu\n%s",
k.k->p.offset, hashed, h->chain.pos.offset,
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
if (ret) {
if (fsck_err_on(k2.k->type == desc.key_type &&
!desc.cmp_bkey(k, k2), c,
"duplicate hash table keys:\n%s",
- bch2_bkey_val_to_text(c, desc.btree_id,
+ bch2_bkey_val_to_text(c, bkey_type(0, desc.btree_id),
buf, sizeof(buf), k))) {
ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
if (ret)
if (fsck_err_on(have_target &&
d.v->d_type !=
- mode_to_type(le16_to_cpu(target.bi_mode)), c,
+ mode_to_type(target.bi_mode), c,
"incorrect d_type: should be %u:\n%s",
- mode_to_type(le16_to_cpu(target.bi_mode)),
+ mode_to_type(target.bi_mode),
bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
}
bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = mode_to_type(le16_to_cpu(target.bi_mode));
+ n->v.d_type = mode_to_type(target.bi_mode);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL,
fsck_err:
return ret;
create_root:
- bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed, root_inode);
if (ret)
return ret;
- bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO,
+ 0, root_inode);
ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+ return "invalid data checksum type";
+
+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+ return "invalid compression type";
+
return NULL;
}
case BCH_INODE_BLOCKDEV:
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
+ char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
- scnprintf(buf, size, "(unpack error)");
+ out += scnprintf(out, end - out, "(unpack error)");
break;
}
- scnprintf(buf, size, "i_size %llu", unpacked.bi_size);
+#define BCH_INODE_FIELD(_name, _bits) \
+ out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
+ BCH_INODE_FIELDS()
+#undef BCH_INODE_FIELD
break;
}
}
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
- uid_t uid, gid_t gid, umode_t mode, dev_t rdev)
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+ struct bch_inode_unpacked *parent)
{
- s64 now = timespec_to_bch2_time(c, CURRENT_TIME);
+ s64 now = timespec_to_bch2_time(c,
+ timespec_trunc(current_kernel_time(),
+ c->sb.time_precision));
memset(inode_u, 0, sizeof(*inode_u));
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
+
+ if (parent) {
+#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
+ BCH_INODE_FIELDS_INHERIT()
+#undef BCH_INODE_FIELD
+ }
}
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = cpu_to_le32(inode_u.bi_generation) + 1;
+ bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
+#include "opts.h"
+
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
- uid_t, gid_t, umode_t, dev_t);
+ uid_t, gid_t, umode_t, dev_t,
+ struct bch_inode_unpacked *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
return div_s64(ns, c->sb.time_precision);
}
+/*
+ * Build a bch_io_opts from the per-inode option fields.
+ *
+ * Inode option fields are stored with a +1 bias: 0 means "unset, inherit
+ * the filesystem default" (so such fields are skipped here), and a nonzero
+ * value v encodes option value v - 1.  See bch2_inode_opt_set() below for
+ * the matching encode side.
+ */
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
+{
+ struct bch_io_opts ret = { 0 };
+
+#define BCH_INODE_OPT(_name, _bits) \
+ if (inode->bi_##_name) \
+ opt_set(ret, _name, inode->bi_##_name - 1);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+/*
+ * Store the raw (already-biased) value @v into the inode option field
+ * selected by @id.  @v == 0 means "unset".  BUG()s if @id is not one of
+ * the per-inode options enumerated by BCH_INODE_OPTS().
+ */
+static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ switch (id) {
+#define BCH_INODE_OPT(_name, ...) \
+ case Opt_##_name: \
+ inode->bi_##_name = v; \
+ break;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Set per-inode option @id to value @v.  Stored with a +1 bias so that the
+ * field is never 0 for a set option (0 is reserved for "unset");
+ * bch2_inode_opts_get() undoes the bias on read.
+ */
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id, u64 v)
+{
+ return __bch2_inode_opt_set(inode, id, v + 1);
+}
+
+/*
+ * Clear per-inode option @id back to "unset" (0), i.e. fall back to the
+ * filesystem-wide default for this option.
+ */
+static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
+ enum bch_opt_id id)
+{
+ return __bch2_inode_opt_set(inode, id, 0);
+}
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "super.h"
#include "super-io.h"
#include <linux/blkdev.h>
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
- unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
!c->devs[ptr->dev]);
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
n->c = c;
n->ca = ca;
- n->ptr_idx = ptr_idx++;
n->submit_time_us = local_clock_us();
n->bio.bi_iter.bi_sector = ptr->offset;
submit_bio(&n->bio);
} else {
n->have_io_ref = false;
- bcache_io_error(c, &n->bio, "device has been removed");
+ n->bio.bi_status = BLK_STS_REMOVED;
bio_endio(&n->bio);
}
}
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
- bch2_disk_reservation_put(op->c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
+ op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
+
closure_return(cl);
}
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
+ struct bkey_s_extent e;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *src, *dst = keys->keys, *n;
+ int ret;
op->flags |= BCH_WRITE_LOOPED;
+ for (src = keys->keys; src != keys->top; src = n) {
+ n = bkey_next(src);
+ bkey_copy(dst, src);
+
+ e = bkey_i_to_s_extent(dst);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (test_bit(ptr->dev, op->failed.d))
+ bch2_extent_drop_ptr(e, ptr);
+
+ ret = bch2_extent_nr_ptrs(e.c)
+ ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+ : -EIO;
+ if (ret) {
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ goto err;
+ }
+
+ dst = bkey_next(dst);
+ }
+
+ keys->top = dst;
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
op->error = ret;
}
}
-
+err:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
if (!(op->flags & BCH_WRITE_DONE))
}
}
-static void bch2_write_io_error(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct keylist *keys = &op->insert_keys;
- struct bch_fs *c = op->c;
- struct bch_extent_ptr *ptr;
- struct bkey_i *k;
- int ret;
-
- for_each_keylist_key(keys, k) {
- struct bkey_i *n = bkey_next(k);
- struct bkey_s_extent e = bkey_i_to_s_extent(k);
-
- extent_for_each_ptr_backwards(e, ptr)
- if (test_bit(ptr->dev, op->failed.d))
- bch2_extent_drop_ptr(e, ptr);
-
- memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
- keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
-
- ret = bch2_extent_nr_ptrs(e.c)
- ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
- : -EIO;
- if (ret) {
- keys->top = keys->keys;
- op->error = ret;
- op->flags |= BCH_WRITE_DONE;
- break;
- }
- }
-
- memset(&op->failed, 0, sizeof(op->failed));
-
- bch2_write_index(cl);
- return;
-}
-
static void bch2_write_endio(struct bio *bio)
{
struct closure *cl = bio->bi_private;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
- if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
set_bit(ca->dev_idx, op->failed.d);
- set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
- }
if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
- ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
- BCH_DATA_USER);
- if (ret)
- goto err;
-
dst->bi_end_io = bch2_write_endio;
dst->bi_private = &op->cl;
bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
!percpu_ref_tryget(&c->writes)) {
__bcache_io_error(c, "read only");
op->error = -EROFS;
- bch2_disk_reservation_put(c, &op->res);
+ if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
+ bch2_disk_reservation_put(c, &op->res);
closure_return(cl);
}
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
- __bch2_write_op_init(&op->write.op, c);
+ bch2_write_op_init(&op->write.op, c);
+ op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
+ op->write.op.compression_type =
+ bch2_compression_opt_to_type(rbio->opts.compression);
op->write.move_dev = -1;
op->write.op.devs = c->fastest_devs;
if (rbio->split)
rbio = bch2_rbio_free(rbio);
else
- rbio->bio.bi_error = 0;
+ rbio->bio.bi_status = 0;
if (!(flags & BCH_READ_NODECODE))
flags |= BCH_READ_MUST_CLONE;
__bch2_read(c, rbio, iter, inode, &avoid, flags);
}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+ blk_status_t error)
{
rbio->retry = retry;
return;
if (retry == READ_ERR) {
- bch2_rbio_parent(rbio)->bio.bi_error = error;
+ bch2_rbio_parent(rbio)->bio.bi_status = error;
bch2_rbio_done(rbio);
} else {
bch2_rbio_punt(rbio, bch2_rbio_retry,
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
return;
}
rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, crc.csum_type);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
return;
decompression_err:
__bcache_io_error(c, "decompression error, inode %llu offset %llu",
rbio->pos.inode,
(u64) rbio->bvec_iter.bi_sector);
- bch2_rbio_error(rbio, READ_ERR, -EIO);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
return;
}
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
+ if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, -EINTR);
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, -EINTR);
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
return;
}
rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
* lose the error)
*/
rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
- &c->bio_read_split));
+ &c->bio_read_split),
+ orig->opts);
rbio->bio.bi_iter = iter;
split = true;
} else {
bch2_read_endio(&rbio->bio);
ret = rbio->retry;
+ if (rbio->split)
+ rbio = bch2_rbio_free(rbio);
if (!ret)
bch2_rbio_done(rbio);
}
* possibly bigger than the memory that was
* originally allocated)
*/
- rbio->bio.bi_error = -EINTR;
+ rbio->bio.bi_status = BLK_STS_AGAIN;
bio_endio(&rbio->bio);
return;
}
case READ_RETRY:
goto retry;
case READ_ERR:
+ rbio->bio.bi_status = BLK_STS_IOERR;
bio_endio(&rbio->bio);
return;
};
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
+
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
+ BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
/* Internal: */
- BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7),
- BCH_WRITE_DONE = (1 << 8),
- BCH_WRITE_LOOPED = (1 << 9),
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 8),
+ BCH_WRITE_DONE = (1 << 9),
+ BCH_WRITE_LOOPED = (1 << 10),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
? op->journal_seq_p : &op->journal_seq;
}
+/*
+ * Point @op at an externally-owned journal sequence number.  Sets
+ * BCH_WRITE_JOURNAL_SEQ_PTR so op_journal_seq() returns journal_seq_p
+ * instead of the op's embedded journal_seq.
+ */
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
+{
+ op->journal_seq_p = journal_seq;
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
+}
+
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
int bch2_write_index_default(struct bch_write_op *);
-static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+/*
+ * Initialize @op with filesystem-wide default IO options (checksum and
+ * compression types from c->opts).  Callers then fill in pos, res,
+ * write_point, nr_replicas etc. before submitting via bch2_write().
+ */
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
- op->csum_type = bch2_data_checksum_type(c);
+ op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = 0;
op->index_update_fn = bch2_write_index_default;
}
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct disk_reservation res,
- struct bch_devs_mask *devs,
- struct write_point_specifier write_point,
- struct bpos pos,
- u64 *journal_seq, unsigned flags)
-{
- __bch2_write_op_init(op, c);
- op->flags = flags;
- op->nr_replicas = res.nr_replicas;
- op->pos = pos;
- op->res = res;
- op->devs = devs;
- op->write_point = write_point;
-
- if (journal_seq) {
- op->journal_seq_p = journal_seq;
- op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
- }
-}
-
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
struct extent_pick_ptr *pick,
unsigned flags)
{
- rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
- rbio->_state = 0;
+ BUG_ON(rbio->_state);
+ /* _state is now expected pre-zeroed by rbio_init() rather than here */
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
-static inline struct bch_read_bio *rbio_init(struct bio *bio)
+/*
+ * Wrap @bio as a bch_read_bio: zero the internal state byte, mark no
+ * promote operation in flight, and record the per-read IO options @opts.
+ */
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+ struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->_state = 0;
+ rbio->_state = 0;
+ rbio->promote = NULL;
+ rbio->opts = opts;
return rbio;
}
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
+#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
struct promote_op *promote;
+ struct bch_io_opts opts;
+
struct work_struct work;
struct bio bio;
struct closure *cl;
};
- u8 ptr_idx;
- u8 replicas_failed;
+ struct bch_devs_list failed;
u8 order;
unsigned split:1,
struct bch_fs *c;
struct workqueue_struct *io_wq;
+ unsigned written; /* sectors */
u16 flags;
- u16 written; /* sectors */
s8 error;
unsigned csum_type:4;
* Given a journal entry we just read, add it to the list of journal entries to
* be replayed:
*/
-static int journal_entry_add(struct bch_fs *c, struct journal_list *jlist,
- struct jset *j)
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+ struct journal_list *jlist, struct jset *j)
{
struct journal_replay *i, *pos;
struct list_head *where;
__le64 last_seq;
int ret;
- mutex_lock(&jlist->lock);
-
last_seq = !list_empty(jlist->head)
? list_last_entry(jlist->head, struct journal_replay,
list)->j.last_seq
memcmp(j, &i->j, bytes), c,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
-
- ret = JOURNAL_ENTRY_ADD_OK;
- goto out;
+ goto found;
}
if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) {
goto out;
}
- memcpy(&i->j, j, bytes);
list_add(&i->list, where);
+ i->devs.nr = 0;
+ memcpy(&i->j, j, bytes);
+found:
+ if (!fsck_err_on(bch2_dev_list_has_dev(i->devs, ca->dev_idx),
+ c, "duplicate journal entries on same device"))
+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
ret = JOURNAL_ENTRY_ADD_OK;
out:
fsck_err:
- mutex_unlock(&jlist->lock);
return ret;
}
#define journal_entry_err_on(cond, c, msg, ...) \
((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
-static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
- int write)
+static int journal_entry_validate_entries(struct bch_fs *c, struct jset *j,
+ int write)
{
struct jset_entry *entry;
int ret = 0;
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(j), c,
"journal entry extends past end of jset")) {
- j->u64s = cpu_to_le64((u64 *) entry - j->_data);
+ j->u64s = cpu_to_le32((u64 *) entry - j->_data);
break;
}
"invalid journal entry: last_seq > seq"))
j->last_seq = j->seq;
- return __journal_entry_validate(c, j, write);
+ return 0;
fsck_err:
return ret;
}
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- ret = journal_entry_add(c, jlist, j);
+ mutex_lock(&jlist->lock);
+ ret = journal_entry_add(c, ca, jlist, j);
+ mutex_unlock(&jlist->lock);
+
switch (ret) {
case JOURNAL_ENTRY_ADD_OK:
*entries_found = true;
for_each_jset_entry_type(entry, &i->j,
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- seq = le64_to_cpu(entry->_data[0]);
+ struct jset_entry_blacklist *bl_entry =
+ container_of(entry, struct jset_entry_blacklist, entry);
+ seq = le64_to_cpu(bl_entry->seq);
bch_verbose(c, "blacklisting existing journal seq %llu", seq);
fsck_err_on(c->sb.clean && journal_has_keys(list), c,
"filesystem marked clean but journal has keys to replay");
+ list_for_each_entry(i, list, list) {
+ ret = journal_entry_validate_entries(c, &i->j, READ);
+ if (ret)
+ goto fsck_err;
+ }
+
i = list_last_entry(list, struct journal_replay, list);
unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
+ p->devs.nr = 0;
}
mutex_lock(&j->blacklist_lock);
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
+ p->devs = i->devs;
if (journal_seq_blacklist_read(j, i, p)) {
mutex_unlock(&j->blacklist_lock);
{
struct journal_buf *w = journal_prev_buf(j);
- atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
+ atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
if (!need_write_just_set &&
test_bit(JOURNAL_NEED_WRITE, &j->flags))
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, count);
+ p->devs.nr = 0;
}
static void __bch2_journal_next_entry(struct journal *j)
bch2_journal_error(j));
}
+/*
+ * Flush every journal pin, then — if the on-disk journal still lags the
+ * in-memory last_seq, or there are dirty btree roots — issue one more meta
+ * journal write so everything is persisted.  Returns 0 or the error from
+ * bch2_journal_meta().
+ */
+int bch2_journal_flush_all_pins(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool flush;
+
+ bch2_journal_flush_pins(j, U64_MAX);
+
+ /* sample under j->lock: has everything actually hit disk? */
+ spin_lock(&j->lock);
+ flush = last_seq(j) != j->last_seq_ondisk ||
+ c->btree_roots_dirty;
+ spin_unlock(&j->lock);
+
+ return flush ? bch2_journal_meta(j) : 0;
+}
+
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
* i.e. whichever device was limiting the current journal entry size.
*/
extent_for_each_ptr_backwards(e, ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (ca->mi.state != BCH_MEMBER_STATE_RW ||
ca->journal.sectors_free <= sectors)
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
- if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
if (r->alive)
bch2_journal_add_btree_root(w, i, &r->key, r->level);
}
+ c->btree_roots_dirty = false;
mutex_unlock(&c->btree_root_lock);
journal_write_compact(jset);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
journal_nonce(jset), jset);
if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
- __journal_entry_validate(c, jset, WRITE))
+ journal_entry_validate_entries(c, jset, WRITE))
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
BCH_DATA_JOURNAL))
goto err;
+ journal_seq_pin(j, le64_to_cpu(jset->seq))->devs =
+ bch2_extent_devs(bkey_i_to_s_c_extent(&j->key));
+
/*
* XXX: we really should just disable the entire journal in nochanges
* mode
goto no_io;
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) {
- ca = c->devs[ptr->dev];
+ ca = bch_dev_bkey_exists(c, ptr->dev);
if (!percpu_ref_tryget(&ca->io_ref)) {
/* XXX: fix this */
bch_err(c, "missing device for journal write\n");
return bch2_journal_flush_seq(j, seq);
}
+/*
+ * Flush all journal pins that reference @dev_idx, then garbage-collect the
+ * journal replicas entries so the superblock no longer claims journal data on
+ * that device (used when removing/evacuating a device).
+ */
+int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct journal_entry_pin_list *p;
+ struct bch_devs_list devs;
+ u64 seq = 0;
+ unsigned iter;
+ int ret = 0;
+
+ /* find the newest pinned entry that still lives on @dev_idx: */
+ spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
+ if (bch2_dev_list_has_dev(p->devs, dev_idx))
+ seq = journal_pin_seq(j, p);
+ spin_unlock(&j->lock);
+
+ bch2_journal_flush_pins(j, seq);
+
+ mutex_lock(&c->replicas_gc_lock);
+ /* NOTE(review): bch2_replicas_gc_start() can fail (-ENOMEM) but its
+ * return value is ignored here — confirm this is intentional */
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
+
+ seq = 0;
+
+ /* re-mark the device lists of all remaining journal entries; drop
+ * j->lock around the (possibly sleeping) superblock update */
+ spin_lock(&j->lock);
+ while (!ret && seq < atomic64_read(&j->seq)) {
+ seq = max(seq, last_seq(j));
+ devs = journal_seq_pin(j, seq)->devs;
+ seq++;
+
+ spin_unlock(&j->lock);
+ ret = bch2_check_mark_super_devlist(c, &devs, BCH_DATA_JOURNAL);
+ spin_lock(&j->lock);
+ }
+ spin_unlock(&j->lock);
+
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+}
+
ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
* journal entries, then force a brand new empty journal entry to be
* written:
*/
- bch2_journal_flush_pins(j, U64_MAX);
- bch2_journal_flush_async(j, NULL);
- bch2_journal_meta(j);
+ bch2_journal_flush_all_pins(j);
cancel_delayed_work_sync(&j->write_work);
cancel_delayed_work_sync(&j->reclaim_work);
*/
struct journal_replay {
struct list_head list;
+ struct bch_devs_list devs;
+ /* must be last: */
struct jset j;
};
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_flush_pins(struct journal *, u64);
+int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
+int bch2_journal_flush_device(struct journal *, unsigned);
void bch2_journal_halt(struct journal *);
struct list_head list;
struct list_head flushed;
atomic_t count;
+ struct bch_devs_list devs;
};
struct journal;
#define MAX_DATA_OFF_ITER 10
-/*
- * This moves only the data off, leaving the meta-data (if any) in place.
- * It walks the key space, and for any key with a valid pointer to the
- * relevant device, it copies it elsewhere, updating the key to point to
- * the copy.
- * The meta-data is moved off by bch_move_meta_data_off_device.
- *
- * Note: If the number of data replicas desired is > 1, ideally, any
- * new copies would not be made in the same device that already have a
- * copy (if there are enough devices).
- * This is _not_ currently implemented. The multiple replicas can
- * land in the same device even if there are others available.
- */
-
-int bch2_move_data_off_device(struct bch_dev *ca)
+static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
struct btree_iter iter;
struct bkey_s_c k;
u64 keys_moved, sectors_moved;
return ret;
}
-/*
- * This walks the btree, and for any node on the relevant device it moves the
- * node elsewhere.
- */
static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca,
enum btree_id id)
{
* is written.
*/
-int bch2_move_metadata_off_device(struct bch_dev *ca)
+static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
+ int flags)
{
- struct bch_fs *c = ca->fs;
unsigned i;
int ret = 0;
return ret;
}
-/*
- * Flagging data bad when forcibly removing a device after failing to
- * migrate the data off the device.
- */
+/*
+ * Move all data off @ca: user data first, then metadata.  Stops and returns
+ * the first error (the ?: chain short-circuits on non-zero).
+ */
+int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
+{
+ return bch2_dev_usrdata_migrate(c, ca, flags) ?:
+ bch2_dev_metadata_migrate(c, ca, flags);
+}
-static int bch2_flag_key_bad(struct btree_iter *iter,
- struct bch_dev *ca,
- struct bkey_s_c_extent orig)
+/*
+ * Drop every pointer to @dev_idx from extent @e (modified in place).  Refuses
+ * with -EINVAL unless @flags explicitly permits the resulting loss (no good
+ * pointers left) or degradation (fewer good pointers than the configured
+ * replica count), using the metadata or data variant of the force flags and
+ * replica option depending on @metadata.
+ */
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
+ unsigned dev_idx, int flags, bool metadata)
{
- BKEY_PADDED(key) tmp;
- struct bkey_s_extent e;
struct bch_extent_ptr *ptr;
- struct bch_fs *c = ca->fs;
-
- bkey_reassemble(&tmp.key, orig.s_c);
- e = bkey_i_to_s_extent(&tmp.key);
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
+ unsigned nr_good;
extent_for_each_ptr_backwards(e, ptr)
- if (ptr->dev == ca->dev_idx)
+ if (ptr->dev == dev_idx)
bch2_extent_drop_ptr(e, ptr);
- /*
- * If the new extent no longer has any pointers, bch2_extent_normalize()
- * will do the appropriate thing with it (turning it into a
- * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
- */
- bch2_extent_normalize(c, e.s);
+ /* only proceed if the caller accepted the resulting redundancy: */
+ nr_good = bch2_extent_nr_good_ptrs(c, e.c);
+ if ((!nr_good && !(flags & lost)) ||
+ (nr_good < replicas && !(flags & degraded)))
+ return -EINVAL;
- return bch2_btree_insert_at(c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
+ return 0;
}
/*
* that we've already tried to move the data MAX_DATA_OFF_ITER times and
* are not likely to succeed if we try again.
*/
-int bch2_flag_data_bad(struct bch_dev *ca)
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
- struct bch_fs *c = ca->fs;
struct bkey_s_c k;
- struct bkey_s_c_extent e;
+ struct bkey_s_extent e;
+ BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
if (!bkey_extent_is_data(k.k))
goto advance;
- e = bkey_s_c_to_extent(k);
- if (!bch2_extent_has_device(e, ca->dev_idx))
+ if (!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx))
goto advance;
- ret = bch2_flag_key_bad(&iter, ca, e);
+ bkey_reassemble(&tmp.key, k);
+ e = bkey_i_to_s_extent(&tmp.key);
+
+ ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
+ if (ret)
+ break;
+
+ /*
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
+ * will do the appropriate thing with it (turning it into a
+ * KEY_TYPE_ERROR key, or just a discard if it was a cached extent)
+ */
+ bch2_extent_normalize(c, e.s);
+
+ if (bkey_extent_is_data(e.k) &&
+ (ret = bch2_check_mark_super(c, e.c, BCH_DATA_USER)))
+ break;
+
+ iter.pos = bkey_start_pos(&tmp.key.k);
+
+ ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOFAIL,
+ BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
if (ret)
break;
- /*
- * If the replica we're dropping was dirty and there is an
- * additional cached replica, the cached replica will now be
- * considered dirty - upon inserting the new version of the key,
- * the bucket accounting will be updated to reflect the fact
- * that the cached data is now dirty and everything works out as
- * if by magic without us having to do anything.
- *
- * The one thing we need to be concerned with here is there's a
- * race between when we drop any stale pointers from the key
- * we're about to insert, and when the key actually gets
- * inserted and the cached data is marked as dirty - we could
- * end up trying to insert a key with a pointer that should be
- * dirty, but points to stale data.
- *
- * If that happens the insert code just bails out and doesn't do
- * the insert - however, it doesn't return an error. Hence we
- * need to always recheck the current key before advancing to
- * the next:
- */
continue;
advance:
if (bkey_extent_is_data(k.k)) {
return ret;
}
+
+/*
+ * Walk every btree node and rewrite the keys of interior/node pointers so
+ * that nothing references @dev_idx anymore, re-marking surviving replicas
+ * under a BCH_DATA_BTREE replicas-gc pass.  Metadata loss
+ * (BCH_FORCE_IF_METADATA_LOST) is explicitly unsupported here.
+ */
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ struct btree_iter iter;
+ struct closure cl;
+ struct btree *b;
+ unsigned id;
+ int ret;
+
+ /* don't handle this yet: */
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
+ return -EINVAL;
+
+ closure_init_stack(&cl);
+
+ mutex_lock(&c->replicas_gc_lock);
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+ for (id = 0; id < BTREE_ID_NR; id++) {
+ for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+ struct bkey_i_extent *new_key;
+retry:
+ if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
+ dev_idx)) {
+ /* node untouched: just re-mark its replicas */
+ bch2_btree_iter_set_locks_want(&iter, 0);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key),
+ BCH_DATA_BTREE);
+ if (ret)
+ goto err;
+ } else {
+ bkey_copy(&tmp.k, &b->key);
+ new_key = bkey_i_to_extent(&tmp.k);
+
+ ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
+ dev_idx, flags, true);
+ if (ret)
+ goto err;
+
+ /* need write locks up the iterator; if upgrading
+ * dropped locks, re-read the node and retry */
+ if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+
+ ret = bch2_btree_node_update_key(c, &iter, b, new_key);
+ if (ret == -EINTR) {
+ b = bch2_btree_iter_peek_node(&iter);
+ goto retry;
+ }
+ if (ret)
+ goto err;
+ }
+ }
+ bch2_btree_iter_unlock(&iter);
+
+ /* btree root */
+ /* NOTE(review): empty lock/unlock pair — presumably a placeholder
+ * (or barrier) for handling the btree root's own pointers; confirm */
+ mutex_lock(&c->btree_root_lock);
+ mutex_unlock(&c->btree_root_lock);
+ }
+
+ ret = 0;
+out:
+ bch2_replicas_gc_end(c, ret);
+ mutex_unlock(&c->replicas_gc_lock);
+
+ return ret;
+err:
+ bch2_btree_iter_unlock(&iter);
+ goto out;
+}
+
+/*
+ * Forcibly drop all references to @dev_idx (user data, then btree metadata),
+ * subject to the BCH_FORCE_IF_* bits in @flags; short-circuits on first error.
+ */
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, dev_idx, flags);
+}
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_move_data_off_device(struct bch_dev *);
-int bch2_move_metadata_off_device(struct bch_dev *);
-int bch2_flag_data_bad(struct bch_dev *);
+int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
+#include "inode.h"
#include "io.h"
#include "move.h"
#include "super-io.h"
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
- if (likely(!io->rbio.bio.bi_error)) {
+ if (likely(!io->rbio.bio.bi_status)) {
bch2_migrate_write_init(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
struct write_point_specifier wp,
int btree_insert_flags,
int move_device,
+ struct bch_io_opts opts,
struct bkey_s_c k)
{
struct extent_pick_ptr pick;
goto err;
}
+ io->rbio.opts = opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = move_read_endio;
- __bch2_write_op_init(&io->write.op, c);
io->write.btree_insert_flags = btree_insert_flags;
io->write.move_dev = move_device;
+
+ bch2_write_op_init(&io->write.op, c);
+ io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
+ io->write.op.compression_type =
+ bch2_compression_opt_to_type(opts.compression);
io->write.op.devs = devs;
io->write.op.write_point = wp;
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt;
+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
+ u64 cur_inum = U64_MAX;
int ret = 0;
bch2_move_ctxt_init(&ctxt);
(bch2_btree_iter_unlock(&iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
-
+peek:
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
if (ret)
break;
- if (!bkey_extent_is_data(k.k) ||
- !pred(arg, bkey_s_c_to_extent(k)))
+ if (!bkey_extent_is_data(k.k))
+ goto next;
+
+ if (cur_inum != k.k->p.inode) {
+ struct bch_inode_unpacked inode;
+
+ /* don't hold btree locks while looking up inode: */
+ bch2_btree_iter_unlock(&iter);
+
+ opts = bch2_opts_to_inode_opts(c->opts);
+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+ cur_inum = k.k->p.inode;
+ goto peek;
+ }
+
+ if (!pred(arg, bkey_s_c_to_extent(k)))
goto next;
/* unlock before doing IO: */
if (bch2_move_extent(c, &ctxt, devs, wp,
btree_insert_flags,
- move_device, k)) {
+ move_device, opts, k)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
#undef BCH_OPT
}
-u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
- return opts->_name; \
-
+ return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
+ default:
+ BUG();
+ }
+}
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
+{
+ switch (id) {
+#define BCH_OPT(_name, ...) \
+ case Opt_##_name: \
+ return opts->_name;
+ BCH_OPTS()
+#undef BCH_OPT
default:
BUG();
}
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
-
BCH_OPTS()
#undef BCH_OPT
-
default:
BUG();
}
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
-
BCH_OPTS()
#undef BCH_OPT
#undef BCH_OPT
};
-static int bch2_opt_lookup(const char *name)
+int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
pr_err("Mount option %s requires a value", name);
return -1;
}
+
+/* io opts: */
+
+/*
+ * Project the filesystem-wide option set down to the per-inode io options
+ * (the BCH_INODE_OPTS() subset), copying only options that are defined in
+ * @src.
+ */
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
+{
+ struct bch_io_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+/*
+ * Inverse of bch2_opts_to_inode_opts(): widen per-inode io options back into
+ * a (mostly-empty) struct bch_opts, copying only options defined in @src.
+ */
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
+{
+ struct bch_opts ret = { 0 };
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(ret, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ return ret;
+}
+
+/*
+ * Overlay @src onto @dst: every option defined in @src overrides the
+ * corresponding option in @dst (used to apply per-inode options on top of
+ * filesystem defaults).
+ */
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
+{
+#define BCH_INODE_OPT(_name, _bits) \
+ if (opt_defined(src, _name)) \
+ opt_set(*dst, _name, src._name);
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+}
+
+/*
+ * Return true if option @id is one of the per-inode io options
+ * (BCH_INODE_OPTS()); linear scan of a small static table.
+ */
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
+{
+ static const enum bch_opt_id inode_opt_list[] = {
+#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+ };
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
+ if (inode_opt_list[i] == id)
+ return true;
+
+ return false;
+}
static inline struct bch_opts bch2_opts_empty(void)
{
- struct bch_opts opts;
-
- memset(&opts, 0, sizeof(opts));
- return opts;
+ return (struct bch_opts) { 0 };
}
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
extern const struct bch_option bch2_opt_table[];
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+int bch2_opt_lookup(const char *);
int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
int bch2_parse_mount_opts(struct bch_opts *, char *);
+/* inode opts: */
+
+#define BCH_INODE_OPTS() \
+ BCH_INODE_OPT(data_checksum, 8) \
+ BCH_INODE_OPT(compression, 8)
+
+struct bch_io_opts {
+#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+
+#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
+ BCH_INODE_OPTS()
+#undef BCH_INODE_OPT
+};
+
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
+
#endif /* _BCACHEFS_OPTS_H */
#include <linux/sort.h>
static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+ struct bch_replicas_cpu *);
static const char *bch2_sb_validate_replicas(struct bch_sb *);
static inline void __bch2_sb_layout_size_assert(void)
return NULL;
f = __bch2_sb_field_resize(sb->sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
}
f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
- f->type = type;
+ f->type = cpu_to_le32(type);
return f;
}
if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
- return "Invalid number of metadata replicas";
+ return "Invalid number of data replicas";
+
+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
+ return "Invalid metadata checksum type";
+
+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
+ return "Invalid compression type";
if (!BCH_SB_BTREE_NODE_SIZE(sb))
return "Btree node size not set";
if (src_f->type == BCH_SB_FIELD_journal)
continue;
- dst_f = bch2_sb_field_get(dst, src_f->type);
+ dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
dst_f = __bch2_sb_field_resize(dst, dst_f,
le32_to_cpu(src_f->u64s));
/* XXX: verify MACs */
csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- (struct nonce) { 0 }, sb->sb);
+ null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum))
return "bad checksum reading superblock";
got_super:
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
le64_to_cpu(ret->sb->version),
- le64_to_cpu(ret->sb->flags),
+ le64_to_cpu(ret->sb->flags[0]),
le64_to_cpu(ret->sb->seq),
- le16_to_cpu(ret->sb->u64s));
+ le32_to_cpu(ret->sb->u64s));
err = "Superblock block size smaller than device block size";
if (le16_to_cpu(ret->sb->block_size) << 9 <
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
- (struct nonce) { 0 }, sb);
+ null_nonce(), sb);
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
bch2_sb_update(c);
}
-/* replica information: */
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+ _i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
return (void *) r->entries + r->entry_size * i;
}
+/*
+ * Sort the replicas table into eytzinger (BFS) order so that
+ * eytzinger0_find() lookups (see replicas_has_entry()) work; entries compare
+ * bytewise via memcmp.
+ */
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
+/*
+ * Build a replicas-table entry from an extent: record @data_type and set a
+ * device bit for every non-cached (dirty) pointer.  @max_dev is set to the
+ * highest device index seen.  Returns the number of dirty pointers (0 means
+ * the extent contributes no replicas entry).
+ */
+static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+ enum bch_data_type data_type,
+ struct bch_replicas_cpu_entry *r,
+ unsigned *max_dev)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned nr = 0;
+
+ BUG_ON(!data_type ||
+ data_type == BCH_DATA_SB ||
+ data_type >= BCH_DATA_NR);
+
+ memset(r, 0, sizeof(*r));
+ r->data_type = data_type;
+
+ *max_dev = 0;
+
+ extent_for_each_ptr(e, ptr)
+ if (!ptr->cached) {
+ *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+ replicas_set_dev(r, ptr->dev);
+ nr++;
+ }
+ return nr;
+}
+
+/*
+ * Allocate a new replicas table containing all of @old's entries plus
+ * @new_entry, growing entry_size if @max_dev needs more device-bitmap bytes,
+ * and re-sorting into eytzinger order.  Returns NULL on allocation failure;
+ * caller owns (and must free) the returned table.
+ */
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *new;
+ unsigned i, nr, entry_size;
+
+ /* one bitmap bit per device, rounded up to whole bytes */
+ entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+ DIV_ROUND_UP(max_dev + 1, 8);
+ entry_size = max(entry_size, old->entry_size);
+ nr = old->nr + 1;
+
+ new = kzalloc(sizeof(struct bch_replicas_cpu) +
+ nr * entry_size, GFP_NOIO);
+ if (!new)
+ return NULL;
+
+ new->nr = nr;
+ new->entry_size = entry_size;
+
+ /* old entries may be narrower than the new entry_size; kzalloc
+ * guarantees the extra bitmap bytes are already zero */
+ for (i = 0; i < old->nr; i++)
+ memcpy(cpu_replicas_entry(new, i),
+ cpu_replicas_entry(old, i),
+ min(new->entry_size, old->entry_size));
+
+ /* NOTE(review): copies new->entry_size bytes from a stack struct —
+ * assumes entry_size <= sizeof(new_entry); confirm devs[] capacity */
+ memcpy(cpu_replicas_entry(new, old->nr),
+ &new_entry,
+ new->entry_size);
+
+ bch2_cpu_replicas_sort(new);
+ return new;
+}
+
+/*
+ * Eytzinger binary search for an exact (memcmp-equal) entry in @r.  If
+ * @max_dev exceeds the table's device-bitmap width the entry can't possibly
+ * be present, so bail early.
+ */
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ return max_dev < replicas_dev_slots(r) &&
+ eytzinger0_find(r->entries, r->nr,
+ r->entry_size,
+ memcmp, &search) < r->nr;
+}
+
+/*
+ * Slow path for __bch2_check_mark_super(): under sb_lock, add @new_entry to
+ * the live replicas table (and the in-progress gc table, if any), write the
+ * updated superblock, and publish the new tables via RCU.  Returns 0 on
+ * success or if another thread raced us and already added the entry.
+ */
+noinline
+static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+ struct bch_replicas_cpu_entry new_entry,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
+ int ret = -ENOMEM;
+
+ mutex_lock(&c->sb_lock);
+
+ old_gc = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+ if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+ new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+ if (!new_gc)
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+ /* recheck, might have raced */
+ if (replicas_has_entry(old_r, new_entry, max_dev))
+ goto out;
+ /* NOTE(review): on the raced goto-out path above, a new_gc allocated
+ * earlier is neither installed nor freed — looks like a leak; confirm */
+
+ new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+ if (!new_r)
+ goto err;
+
+ ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+ if (ret)
+ goto err;
+
+ if (new_gc) {
+ rcu_assign_pointer(c->replicas_gc, new_gc);
+ kfree_rcu(old_gc, rcu);
+ }
+
+ rcu_assign_pointer(c->replicas, new_r);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+out:
+ ret = 0;
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+/*
+ * Fast path: under RCU, check whether @search is already present in the live
+ * replicas table (and, if a gc pass is running, in the gc table too).  Only
+ * falls back to the sb_lock-taking slow path when the entry is missing.
+ */
+static inline int __bch2_check_mark_super(struct bch_fs *c,
+ struct bch_replicas_cpu_entry search,
+ unsigned max_dev)
+{
+ struct bch_replicas_cpu *r, *gc_r;
+ bool marked;
+
+ rcu_read_lock();
+ r = rcu_dereference(c->replicas);
+ gc_r = rcu_dereference(c->replicas_gc);
+ /* gc_r == NULL means no gc in progress; then only r matters */
+ marked = replicas_has_entry(r, search, max_dev) &&
+ (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+ rcu_read_unlock();
+
+ return likely(marked) ? 0
+ : bch2_check_mark_super_slowpath(c, search, max_dev);
+}
+
+/*
+ * Ensure the superblock's replicas section records the devices holding
+ * extent @e as @data_type.  Extents with no dirty pointers produce no entry
+ * and trivially succeed.
+ */
+int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return 0;
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+/*
+ * Like bch2_check_mark_super(), but for an explicit device list (used for
+ * journal replicas) rather than an extent's pointers.  An empty list is a
+ * no-op.
+ */
+int bch2_check_mark_super_devlist(struct bch_fs *c,
+ struct bch_devs_list *devs,
+ enum bch_data_type data_type)
+{
+ struct bch_replicas_cpu_entry search = { .data_type = data_type };
+ unsigned i, max_dev = 0;
+
+ if (!devs->nr)
+ return 0;
+
+ for (i = 0; i < devs->nr; i++) {
+ max_dev = max_t(unsigned, max_dev, devs->devs[i]);
+ replicas_set_dev(&search, devs->devs[i]);
+ }
+
+ return __bch2_check_mark_super(c, search, max_dev);
+}
+
+/*
+ * Finish a replicas gc pass: on success, write the gc table to the
+ * superblock and promote it to the live table; if the pass itself failed
+ * (@err), discard the gc table and keep the old one.  Returns 0, or -ENOSPC
+ * if the superblock couldn't hold the new table.
+ */
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+ struct bch_replicas_cpu *new_r, *old_r;
+ int ret = 0;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+
+ new_r = rcu_dereference_protected(c->replicas_gc,
+ lockdep_is_held(&c->sb_lock));
+
+ if (err) {
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(new_r, rcu);
+ goto err;
+ }
+
+ /* NOTE(review): on this -ENOSPC path c->replicas_gc stays installed,
+ * so a later gc_start() would hit its BUG_ON — confirm intended */
+ if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+ ret = -ENOSPC;
+ goto err;
+ }
+
+ old_r = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ rcu_assign_pointer(c->replicas, new_r);
+ rcu_assign_pointer(c->replicas_gc, NULL);
+ kfree_rcu(old_r, rcu);
+
+ bch2_write_super(c);
+err:
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
+/*
+ * Begin a replicas gc pass: snapshot the live table minus every entry whose
+ * data_type is in @typemask, and install it as c->replicas_gc.  While the gc
+ * table exists, __bch2_check_mark_super() re-adds entries of those types as
+ * they are encountered, so stale ones fall away at gc_end().
+ */
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+ struct bch_replicas_cpu *dst, *src;
+ struct bch_replicas_cpu_entry *e;
+
+ lockdep_assert_held(&c->replicas_gc_lock);
+
+ mutex_lock(&c->sb_lock);
+ BUG_ON(c->replicas_gc);
+
+ src = rcu_dereference_protected(c->replicas,
+ lockdep_is_held(&c->sb_lock));
+
+ /* sized to hold every source entry; filtering only shrinks it */
+ dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+ src->nr * src->entry_size, GFP_NOIO);
+ if (!dst) {
+ mutex_unlock(&c->sb_lock);
+ return -ENOMEM;
+ }
+
+ dst->nr = 0;
+ dst->entry_size = src->entry_size;
+
+ /* keep only entries whose type is NOT being garbage collected */
+ for_each_cpu_replicas_entry(src, e)
+ if (!((1 << e->data_type) & typemask))
+ memcpy(cpu_replicas_entry(dst, dst->nr++),
+ e, dst->entry_size);
+
+ bch2_cpu_replicas_sort(dst);
+
+ rcu_assign_pointer(c->replicas_gc, dst);
+ mutex_unlock(&c->sb_lock);
+
+ return 0;
+}
+
+/* Replicas tracking - superblock: */
+
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
}
}
- eytzinger0_sort(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
+ bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
- lockdep_assert_held(&c->sb_lock);
-
sb_r = bch2_sb_get_replicas(c->disk_sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
- old_r = c->replicas;
+ old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
return 0;
}
-static void bkey_to_replicas(struct bkey_s_c_extent e,
- enum bch_data_type data_type,
- struct bch_replicas_cpu_entry *r,
- unsigned *max_dev)
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+ struct bch_replicas_cpu *r)
{
- const struct bch_extent_ptr *ptr;
-
- BUG_ON(!data_type ||
- data_type == BCH_DATA_SB ||
- data_type >= BCH_DATA_NR);
-
- memset(r, 0, sizeof(*r));
- r->data_type = data_type;
-
- *max_dev = 0;
-
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached) {
- *max_dev = max_t(unsigned, *max_dev, ptr->dev);
- replicas_set_dev(r, ptr->dev);
- }
-}
+ struct bch_sb_field_replicas *sb_r;
+ struct bch_replicas_entry *sb_e;
+ struct bch_replicas_cpu_entry *e;
+ size_t i, bytes;
-/*
- * for when gc of replica information is in progress:
- */
-static int bch2_update_gc_replicas(struct bch_fs *c,
- struct bch_replicas_cpu *gc_r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry new_e;
- struct bch_replicas_cpu *new;
- unsigned i, nr, entry_size, max_dev;
+ bytes = sizeof(struct bch_sb_field_replicas);
- bkey_to_replicas(e, data_type, &new_e, &max_dev);
+ for_each_cpu_replicas_entry(r, e) {
+ bytes += sizeof(struct bch_replicas_entry);
+ for (i = 0; i < r->entry_size - 1; i++)
+ bytes += hweight8(e->devs[i]);
+ }
- entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
- DIV_ROUND_UP(max_dev + 1, 8);
- entry_size = max(entry_size, gc_r->entry_size);
- nr = gc_r->nr + 1;
+ sb_r = bch2_fs_sb_resize_replicas(c,
+ DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+ if (!sb_r)
+ return -ENOSPC;
- new = kzalloc(sizeof(struct bch_replicas_cpu) +
- nr * entry_size, GFP_NOIO);
- if (!new)
- return -ENOMEM;
+ memset(&sb_r->entries, 0,
+ vstruct_end(&sb_r->field) -
+ (void *) &sb_r->entries);
- new->nr = nr;
- new->entry_size = entry_size;
+ sb_e = sb_r->entries;
+ for_each_cpu_replicas_entry(r, e) {
+ sb_e->data_type = e->data_type;
- for (i = 0; i < gc_r->nr; i++)
- memcpy(cpu_replicas_entry(new, i),
- cpu_replicas_entry(gc_r, i),
- gc_r->entry_size);
+ for (i = 0; i < replicas_dev_slots(r); i++)
+ if (replicas_test_dev(e, i))
+ sb_e->devs[sb_e->nr++] = i;
- memcpy(cpu_replicas_entry(new, nr - 1),
- &new_e,
- new->entry_size);
+ sb_e = replicas_entry_next(sb_e);
- eytzinger0_sort(new->entries,
- new->nr,
- new->entry_size,
- memcmp, NULL);
+ BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+ }
- rcu_assign_pointer(c->replicas_gc, new);
- kfree_rcu(gc_r, rcu);
return 0;
}
-static bool replicas_has_extent(struct bch_replicas_cpu *r,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- struct bch_replicas_cpu_entry search;
- unsigned max_dev;
-
- bkey_to_replicas(e, data_type, &search, &max_dev);
-
- return max_dev < replicas_dev_slots(r) &&
- eytzinger0_find(r->entries, r->nr,
- r->entry_size,
- memcmp, &search) < r->nr;
-}
-
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
- enum bch_data_type data_type)
-{
- bool ret;
-
- rcu_read_lock();
- ret = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type);
- rcu_read_unlock();
-
- return ret;
-}
-
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
- struct bkey_s_c_extent e,
- enum bch_data_type data_type)
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
{
- struct bch_replicas_cpu *gc_r;
- const struct bch_extent_ptr *ptr;
+ struct bch_sb_field_members *mi;
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *new_entry;
- unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev;
- int ret = 0;
+ struct bch_replicas_cpu *cpu_r = NULL;
+ struct bch_replicas_entry *e;
+ const char *err;
+ unsigned i;
- mutex_lock(&c->sb_lock);
+ mi = bch2_sb_get_members(sb);
+ sb_r = bch2_sb_get_replicas(sb);
+ if (!sb_r)
+ return NULL;
- gc_r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
- if (gc_r &&
- !replicas_has_extent(gc_r, e, data_type)) {
- ret = bch2_update_gc_replicas(c, gc_r, e, data_type);
- if (ret)
+ for_each_replicas_entry(sb_r, e) {
+ err = "invalid replicas entry: invalid data type";
+ if (e->data_type >= BCH_DATA_NR)
goto err;
- }
-
- /* recheck, might have raced */
- if (bch2_sb_has_replicas(c, e, data_type)) {
- mutex_unlock(&c->sb_lock);
- return 0;
- }
- new_entry_bytes = sizeof(struct bch_replicas_entry) +
- bch2_extent_nr_dirty_ptrs(e.s_c);
-
- sb_r = bch2_sb_get_replicas(c->disk_sb);
+ err = "invalid replicas entry: no devices";
+ if (!e->nr)
+ goto err;
- bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+ err = "invalid replicas entry: too many devices";
+ if (e->nr >= BCH_REPLICAS_MAX)
+ goto err;
- new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64));
+ err = "invalid replicas entry: invalid device";
+ for (i = 0; i < e->nr; i++)
+ if (!bch2_dev_exists(sb, mi, e->devs[i]))
+ goto err;
+ }
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes,
- sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
+ err = "cannot allocate memory";
+ cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+ if (!cpu_r)
goto err;
- }
- new_entry = (void *) sb_r + bytes;
- new_entry->data_type = data_type;
- new_entry->nr = 0;
+ sort_cmp_size(cpu_r->entries,
+ cpu_r->nr,
+ cpu_r->entry_size,
+ memcmp, NULL);
+
+ for (i = 0; i + 1 < cpu_r->nr; i++) {
+ struct bch_replicas_cpu_entry *l =
+ cpu_replicas_entry(cpu_r, i);
+ struct bch_replicas_cpu_entry *r =
+ cpu_replicas_entry(cpu_r, i + 1);
- extent_for_each_ptr(e, ptr)
- if (!ptr->cached)
- new_entry->devs[new_entry->nr++] = ptr->dev;
+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
- ret = bch2_sb_replicas_to_cpu_replicas(c);
- if (ret) {
- memset(new_entry, 0,
- vstruct_end(&sb_r->field) - (void *) new_entry);
- goto err;
+ err = "duplicate replicas entry";
+ if (!memcmp(l, r, cpu_r->entry_size))
+ goto err;
}
- bch2_write_super(c);
+ err = NULL;
err:
- mutex_unlock(&c->sb_lock);
- return ret;
+ kfree(cpu_r);
+ return err;
}
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
+/* Query replicas: */
+
+bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
enum bch_data_type data_type)
{
- struct bch_replicas_cpu *gc_r;
- bool marked;
+ struct bch_replicas_cpu_entry search;
+ unsigned max_dev;
+ bool ret;
+
+ if (!bkey_to_replicas(e, data_type, &search, &max_dev))
+ return true;
rcu_read_lock();
- marked = replicas_has_extent(rcu_dereference(c->replicas),
- e, data_type) &&
- (!(gc_r = rcu_dereference(c->replicas_gc)) ||
- replicas_has_extent(gc_r, e, data_type));
+ ret = replicas_has_entry(rcu_dereference(c->replicas),
+ search, max_dev);
rcu_read_unlock();
- if (marked)
- return 0;
-
- return bch2_check_mark_super_slowpath(c, e, data_type);
+ return ret;
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
- struct bch_devs_mask online_devs)
+ struct bch_devs_mask online_devs)
{
+ struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
+ mi = bch2_sb_get_members(c->disk_sb);
rcu_read_lock();
- r = rcu_dereference(c->replicas);
- dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices);
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
+ r = rcu_dereference(c->replicas);
+ dev_slots = replicas_dev_slots(r);
- BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas));
+ for_each_cpu_replicas_entry(r, e) {
+ if (e->data_type >= ARRAY_SIZE(ret.replicas))
+ panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
if (!replicas_test_dev(e, dev))
continue;
+ BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+
if (test_bit(dev, online_devs.d))
nr_online++;
else
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
- unsigned i, ret = 0;
+ unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
- for (i = 0; i < r->nr; i++) {
- e = cpu_replicas_entry(r, i);
-
+ for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx)) {
ret |= 1 << e->data_type;
break;
}
- }
out:
rcu_read_unlock();
return ret;
}
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb)
-{
- struct bch_sb_field_members *mi;
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *cpu_r = NULL;
- struct bch_replicas_entry *e;
- const char *err;
- unsigned i;
-
- mi = bch2_sb_get_members(sb);
- sb_r = bch2_sb_get_replicas(sb);
- if (!sb_r)
- return NULL;
-
- for_each_replicas_entry(sb_r, e) {
- err = "invalid replicas entry: invalid data type";
- if (e->data_type >= BCH_DATA_NR)
- goto err;
-
- err = "invalid replicas entry: too many devices";
- if (e->nr >= BCH_REPLICAS_MAX)
- goto err;
-
- err = "invalid replicas entry: invalid device";
- for (i = 0; i < e->nr; i++)
- if (!bch2_dev_exists(sb, mi, e->devs[i]))
- goto err;
- }
-
- err = "cannot allocate memory";
- cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
- if (!cpu_r)
- goto err;
-
- sort_cmp_size(cpu_r->entries,
- cpu_r->nr,
- cpu_r->entry_size,
- memcmp, NULL);
-
- for (i = 0; i + 1 < cpu_r->nr; i++) {
- struct bch_replicas_cpu_entry *l =
- cpu_replicas_entry(cpu_r, i);
- struct bch_replicas_cpu_entry *r =
- cpu_replicas_entry(cpu_r, i + 1);
-
- BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
- err = "duplicate replicas entry";
- if (!memcmp(l, r, cpu_r->entry_size))
- goto err;
- }
-
- err = NULL;
-err:
- kfree(cpu_r);
- return err;
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
- struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_cpu *r, *old_r;
- struct bch_replicas_entry *dst_e;
- size_t i, j, bytes, dev_slots;
- int ret = 0;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
-
- r = rcu_dereference_protected(c->replicas_gc,
- lockdep_is_held(&c->sb_lock));
-
- if (err) {
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(r, rcu);
- goto err;
- }
-
- dev_slots = replicas_dev_slots(r);
-
- bytes = sizeof(struct bch_sb_field_replicas);
-
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *e =
- cpu_replicas_entry(r, i);
-
- bytes += sizeof(struct bch_replicas_entry);
- for (j = 0; j < r->entry_size - 1; j++)
- bytes += hweight8(e->devs[j]);
- }
-
- sb_r = bch2_fs_sb_resize_replicas(c,
- DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
- if (!sb_r) {
- ret = -ENOSPC;
- goto err;
- }
-
- memset(&sb_r->entries, 0,
- vstruct_end(&sb_r->field) -
- (void *) &sb_r->entries);
-
- dst_e = sb_r->entries;
- for (i = 0; i < r->nr; i++) {
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(r, i);
-
- dst_e->data_type = src_e->data_type;
-
- for (j = 0; j < dev_slots; j++)
- if (replicas_test_dev(src_e, j))
- dst_e->devs[dst_e->nr++] = j;
-
- dst_e = replicas_entry_next(dst_e);
- }
-
- old_r = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
- rcu_assign_pointer(c->replicas, r);
- rcu_assign_pointer(c->replicas_gc, NULL);
- kfree_rcu(old_r, rcu);
-
- bch2_write_super(c);
-err:
- mutex_unlock(&c->sb_lock);
- return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
- struct bch_replicas_cpu *r, *src;
- unsigned i;
-
- lockdep_assert_held(&c->replicas_gc_lock);
-
- mutex_lock(&c->sb_lock);
- BUG_ON(c->replicas_gc);
-
- src = rcu_dereference_protected(c->replicas,
- lockdep_is_held(&c->sb_lock));
-
- r = kzalloc(sizeof(struct bch_replicas_cpu) +
- src->nr * src->entry_size, GFP_NOIO);
- if (!r) {
- mutex_unlock(&c->sb_lock);
- return -ENOMEM;
- }
-
- r->entry_size = src->entry_size;
- r->nr = 0;
-
- for (i = 0; i < src->nr; i++) {
- struct bch_replicas_cpu_entry *dst_e =
- cpu_replicas_entry(r, r->nr);
- struct bch_replicas_cpu_entry *src_e =
- cpu_replicas_entry(src, i);
-
- if (!(src_e->data_type & typemask)) {
- memcpy(dst_e, src_e, r->entry_size);
- r->nr++;
- }
- }
-
- eytzinger0_sort(r->entries,
- r->nr,
- r->entry_size,
- memcmp, NULL);
-
- rcu_assign_pointer(c->replicas_gc, r);
- mutex_unlock(&c->sb_lock);
-
- return 0;
-}
/* replicas: */
-/* iterate over bch_sb_field_replicas: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
- return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i) \
- for (_i = (_r)->entries; \
- (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
- (_i) = replicas_entry_next(_i))
-
bool bch2_sb_has_replicas(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
int bch2_check_mark_super(struct bch_fs *, struct bkey_s_c_extent,
enum bch_data_type);
+int bch2_check_mark_super_devlist(struct bch_fs *, struct bch_devs_list *,
+ enum bch_data_type);
struct replicas_status {
struct {
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+ return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i) \
+ for (_i = (_r)->entries; \
+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+ (_i) = replicas_entry_next(_i))
+
#endif /* _BCACHEFS_SUPER_IO_H */
return c;
}
-int bch2_congested(struct bch_fs *c, int bdi_bits)
+int bch2_congested(void *data, int bdi_bits)
{
+ struct bch_fs *c = data;
struct backing_dev_info *bdi;
struct bch_dev *ca;
unsigned i;
return ret;
}
-static int bch2_congested_fn(void *data, int bdi_bits)
-{
- struct bch_fs *c = data;
-
- return bch2_congested(c, bdi_bits);
-}
-
/* Filesystem RO/RW: */
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
*/
- bch2_journal_flush_pins(&c->journal, U64_MAX);
+ bch2_journal_flush_all_pins(&c->journal);
if (!bch2_journal_error(&c->journal))
bch2_btree_verify_flushed(c);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- if (c->bdi.bdi_list.next)
- bdi_destroy(&c->bdi);
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
mempool_exit(&c->btree_reserve_pool);
mempool_exit(&c->fill_iter);
percpu_ref_exit(&c->writes);
- kfree(c->replicas);
+ kfree(rcu_dereference_protected(c->replicas, 1));
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
- bch2_dev_free(c->devs[i]);
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
closure_debug_destroy(&c->cl);
kobject_put(&c->kobj);
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
- offsetof(struct btree_read_bio, bio)) ||
- bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio)) ||
- bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct btree_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+ BIOSET_NEED_BVECS) ||
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+ BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
- bdi_setup_and_register(&c->bdi, "bcachefs") ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_fs_fsio_init(c))
goto err;
- c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- c->bdi.congested_fn = bch2_congested_fn;
- c->bdi.congested_data = c;
-
mi = bch2_sb_get_members(c->disk_sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
continue;
err = "error reading btree root";
- if (bch2_btree_root_read(c, i, k, level))
- goto err;
+ if (bch2_btree_root_read(c, i, k, level)) {
+ if (i != BTREE_ID_ALLOC)
+ goto err;
+
+ mustfix_fsck_err(c, "error reading btree root");
+ }
}
err = "error reading allocation information";
closure_sync(&cl);
bch2_inode_init(c, &inode, 0, 0,
- S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0);
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
inode.bi_inum = BCACHEFS_ROOT_INO;
bch2_inode_pack(&packed_inode, &inode);
bch2_journal_entries_free(&journal);
return err;
err:
+fsck_err:
closure_sync(&cl);
switch (ret) {
kobject_put(&ca->kobj);
}
-static void bch2_dev_io_ref_release(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->offline_complete);
-}
-
static void __bch2_dev_offline(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
lockdep_assert_held(&c->state_lock);
+ if (percpu_ref_is_zero(&ca->io_ref))
+ return;
+
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->offline_complete);
+ reinit_completion(&ca->io_ref_completion);
percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->offline_complete);
+ wait_for_completion(&ca->io_ref_completion);
if (ca->kobj.state_in_sysfs) {
struct kobject *block =
bch2_dev_journal_exit(ca);
}
-static void bch2_dev_ref_release(struct percpu_ref *ref)
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
struct bch_dev *ca = container_of(ref, struct bch_dev, ref);
- complete(&ca->stop_complete);
+ complete(&ca->ref_completion);
}
-static void bch2_dev_stop(struct bch_dev *ca)
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
{
- struct bch_fs *c = ca->fs;
-
- lockdep_assert_held(&c->state_lock);
-
- BUG_ON(rcu_access_pointer(c->devs[ca->dev_idx]) != ca);
- rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
-
- synchronize_rcu();
+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
- reinit_completion(&ca->stop_complete);
- percpu_ref_kill(&ca->ref);
- wait_for_completion(&ca->stop_complete);
+ complete(&ca->io_ref_completion);
}
static int bch2_dev_sysfs_online(struct bch_dev *ca)
return -ENOMEM;
kobject_init(&ca->kobj, &bch2_dev_ktype);
- init_completion(&ca->stop_complete);
- init_completion(&ca->offline_complete);
+ init_completion(&ca->ref_completion);
+ init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
- if (percpu_ref_init(&ca->ref, bch2_dev_ref_release,
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release,
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
!init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets,
GFP_KERNEL) ||
GFP_KERNEL|__GFP_ZERO)) ||
!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
bioset_init(&ca->replica_set, 4,
- offsetof(struct bch_write_bio, bio)) ||
+ offsetof(struct bch_write_bio, bio), 0) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
struct bch_dev *ca;
int ret;
- lockdep_assert_held(&c->sb_lock);
-
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
- ca = c->devs[sb->sb->dev_idx];
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
if (ca->disk_sb.bdev) {
bch_err(c, "already have device online in slot %u",
sb->sb->dev_idx);
return -EINVAL;
}
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
if (bch2_dev_sysfs_online(ca))
pr_warn("error creating sysfs objects");
- bch2_mark_dev_superblock(c, ca, 0);
+ bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
{
struct replicas_status s;
struct bch_sb_field_members *mi;
+ struct bch_dev *ca;
unsigned i, flags = c->opts.degraded
? BCH_FORCE_IF_DEGRADED
: 0;
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
- for (i = 0; i < c->disk_sb->nr_devices; i++)
- if (bch2_dev_exists(c->disk_sb, mi, i) &&
- !bch2_dev_is_online(c->devs[i]) &&
- (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
- c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+ for (i = 0; i < c->disk_sb->nr_devices; i++) {
+ if (!bch2_dev_exists(c->disk_sb, mi, i))
+ continue;
+
+ ca = bch_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_RW ||
+ ca->mi.state == BCH_MEMBER_STATE_RO)) {
mutex_unlock(&c->sb_lock);
return false;
}
+ }
mutex_unlock(&c->sb_lock);
}
*
* flag_data_bad() does not check btree pointers
*/
- ret = bch2_flag_data_bad(ca);
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret) {
- bch_err(ca, "Remove failed");
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
+ goto err;
+ }
+
+ ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
+ if (ret) {
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
- bch_err(ca, "Remove failed, still has data (%x)", data);
+ char data_has_str[100];
+ bch2_scnprint_flag_list(data_has_str,
+ sizeof(data_has_str),
+ bch2_data_types,
+ data);
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
+ ret = -EBUSY;
goto err;
}
- bch2_journal_meta(&c->journal);
+ ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+ POS(ca->dev_idx, 0),
+ POS(ca->dev_idx + 1, 0),
+ ZERO_VERSION,
+ NULL, NULL, NULL);
+ if (ret) {
+ bch_err(ca, "Remove failed, error deleting alloc info");
+ goto err;
+ }
+
+ /*
+ * must flush all existing journal entries, they might have
+ * (overwritten) keys that point to the device we're removing:
+ */
+ ret = bch2_journal_flush_all_pins(&c->journal);
+ if (ret) {
+ bch_err(ca, "Remove failed, journal error");
+ goto err;
+ }
__bch2_dev_offline(ca);
- bch2_dev_stop(ca);
+
+ mutex_lock(&c->sb_lock);
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
+ mutex_unlock(&c->sb_lock);
+
+ percpu_ref_kill(&ca->ref);
+ wait_for_completion(&ca->ref_completion);
+
bch2_dev_free(ca);
/*
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = "journal alloc failed";
if (bch2_dev_journal_alloc(ca))
/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
- struct bch_sb_handle sb = { 0 };
+ struct bch_sb_handle sb = { NULL };
struct bch_dev *ca;
unsigned dev_idx;
const char *err;
}
mutex_unlock(&c->sb_lock);
- ca = c->devs[dev_idx];
+ ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
return -EINVAL;
}
- __bch2_dev_read_only(c, ca);
__bch2_dev_offline(ca);
mutex_unlock(&c->state_lock);
int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
{
unsigned data;
- int ret;
+ int ret = 0;
mutex_lock(&c->state_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
bch_err(ca, "Cannot migrate data off RW device");
- mutex_unlock(&c->state_lock);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
- mutex_unlock(&c->state_lock);
-
- ret = bch2_move_data_off_device(ca);
+ ret = bch2_dev_data_migrate(c, ca, 0);
if (ret) {
bch_err(ca, "Error migrating data: %i", ret);
- return ret;
- }
-
- ret = bch2_move_metadata_off_device(ca);
- if (ret) {
- bch_err(ca, "Error migrating metadata: %i", ret);
- return ret;
+ goto err;
}
data = bch2_dev_has_data(c, ca);
if (data) {
bch_err(ca, "Migrate error: data still present (%x)", data);
- return -EINVAL;
+ ret = -EINVAL;
+ goto err;
}
-
- return 0;
+err:
+ mutex_unlock(&c->state_lock);
+ return ret;
}
/* Filesystem open: */
}
}
+/*
+ * Append @dev to @devs; BUGs on a duplicate or a full list, so callers
+ * must check bch2_dev_list_has_dev() / capacity first.
+ */
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+					 unsigned dev)
+{
+	BUG_ON(bch2_dev_list_has_dev(*devs, dev));
+	BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
+	devs->devs[devs->nr++] = dev;
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
struct bch_devs_mask *mask)
{
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+ return rcu_dereference_check(c->devs[idx], 1);
+}
+
+/*
+ * Look up a device by index when sb_lock or state_lock pins the device
+ * table — no rcu_read_lock() required (checked via lockdep).
+ */
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+	EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+	return rcu_dereference_protected(c->devs[idx],
+					 lockdep_is_held(&c->sb_lock) ||
+					 lockdep_is_held(&c->state_lock));
+}
+
/* XXX kill, move to struct bch_fs */
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
-int bch2_congested(struct bch_fs *, int);
+int bch2_congested(void *, int);
bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
enum bch_member_state, int);
c->open_buckets_wait.list.first ? "waiting" : "empty");
}
-const char * const bch2_rw[] = {
+static const char * const bch2_rw[] = {
"read",
"write",
NULL
#include "clock.h"
#include "extents.h"
#include "io.h"
-#include "keylist.h"
#include "move.h"
#include "super-io.h"
#include "tier.h"
return false;
extent_for_each_ptr(e, ptr)
- if (c->devs[ptr->dev]->mi.tier >= tier->idx)
+ if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
replicas++;
return replicas < c->opts.data_replicas;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(_dst, _src, _len) \
+#define memcpy(dst, src, len) \
({ \
+ void *_dst = (dst); \
+ const void *_src = (src); \
+ size_t _len = (len); \
+ \
BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
(void *) (_dst) + (_len) <= (void *) (_src))); \
memcpy(_dst, _src, _len); \
*/
#define __vstruct_u64s(_s) \
({ \
- ( type_is((_s)->u64s, u64) ? le64_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u32) ? le32_to_cpu((_s)->u64s) \
- : type_is((_s)->u64s, u16) ? le16_to_cpu((_s)->u64s) \
- : ((_s)->u64s)); \
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
+ : ((__force u8) ((_s)->u64s))); \
})
#define __vstruct_bytes(_type, _u64s) \
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
+#include "compress.h"
#include "extents.h"
#include "fs.h"
#include "str_hash.h"
.flags = BCH_XATTR_INDEX_SECURITY,
};
-static const struct xattr_handler *bch_xattr_handler_map[] = {
- [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
- &posix_acl_access_xattr_handler,
- [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
- &posix_acl_default_xattr_handler,
- [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
- [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+
+/*
+ * Expose per-inode bcachefs options (the "bcachefs." xattr namespace) as
+ * virtual xattrs: format the option's current value into @buffer.
+ *
+ * Returns the value length, -EINVAL for an unknown/non-inode option,
+ * -ENODATA if the option isn't set on this inode, -ERANGE if @buffer is
+ * too small for the complete value.
+ */
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, void *buffer, size_t size)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_opts opts =
+		bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
+	const struct bch_option *opt;
+	int ret, id;
+	u64 v;
+
+	id = bch2_opt_lookup(name);
+	if (id < 0 || !bch2_opt_is_inode_opt(id))
+		return -EINVAL;
+
+	opt = bch2_opt_table + id;
+
+	if (!bch2_opt_defined_by_id(&opts, id))
+		return -ENODATA;
+
+	v = bch2_opt_get_by_id(&opts, id);
+
+	if (opt->type == BCH_OPT_STR)
+		ret = snprintf(buffer, size, "%s", opt->choices[v]);
+	else
+		ret = snprintf(buffer, size, "%llu", v);
+
+	/*
+	 * snprintf() returns the length the full value needs (excluding the
+	 * terminating NUL); when ret == size the buffer contents were
+	 * truncated, so only ret < size means the caller got the complete
+	 * value.  A NULL buffer is the "query required size" convention.
+	 * (Was "ret <= size", which returned success on a truncated value;
+	 * the cast also avoids a signed/unsigned comparison.)
+	 */
+	return !buffer || (size_t) ret < size ? ret : -ERANGE;
+}
+
+/* Arguments threaded through __bch2_write_inode() to inode_opt_set_fn() */
+struct inode_opt_set {
+	int			id;
+	u64			v;
+	bool			defined;
};
+/* Inode-update callback: set (defined) or clear one inode option */
+static int inode_opt_set_fn(struct bch_inode_info *inode,
+			    struct bch_inode_unpacked *bi,
+			    void *p)
+{
+	struct inode_opt_set *s = p;
+
+	if (s->defined)
+		bch2_inode_opt_set(bi, s->id, s->v);
+	else
+		bch2_inode_opt_clear(bi, s->id);
+	return 0;
+}
+
+/*
+ * Set or remove (@value == NULL) a per-inode bcachefs option via the
+ * "bcachefs." xattr namespace.  The inode update goes through
+ * __bch2_write_inode() under ei_update_lock.
+ */
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+				   struct dentry *dentry, struct inode *vinode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	const struct bch_option *opt;
+	char *buf;
+	struct inode_opt_set s;
+	int ret;
+
+	s.id = bch2_opt_lookup(name);
+	if (s.id < 0 || !bch2_opt_is_inode_opt(s.id))
+		return -EINVAL;
+
+	opt = bch2_opt_table + s.id;
+
+	if (value) {
+		/* NUL-terminate a copy: xattr values aren't C strings */
+		buf = kmalloc(size + 1, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		memcpy(buf, value, size);
+		buf[size] = '\0';
+
+		ret = bch2_opt_parse(opt, buf, &s.v);
+		kfree(buf);
+
+		if (ret < 0)
+			return ret;
+
+		/*
+		 * Setting a compression type requires the matching superblock
+		 * feature/compressed-data support to be in place first.
+		 */
+		if (s.id == Opt_compression) {
+			mutex_lock(&c->sb_lock);
+			ret = bch2_check_set_has_compressed_data(c, s.v);
+			mutex_unlock(&c->sb_lock);
+
+			if (ret)
+				return ret;
+		}
+
+		s.defined = true;
+	} else {
+		s.defined = false;
+	}
+
+	mutex_lock(&inode->ei_update_lock);
+	ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return ret;
+}
+
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
+ .prefix = "bcachefs.",
+ .get = bch2_xattr_bcachefs_get,
+ .set = bch2_xattr_bcachefs_set,
+};
+
+#endif /* NO_BCACHEFS_FS */
+
const struct xattr_handler *bch2_xattr_handlers[] = {
&bch_xattr_user_handler,
&posix_acl_access_xattr_handler,
&posix_acl_default_xattr_handler,
&bch_xattr_trusted_handler,
&bch_xattr_security_handler,
+#ifndef NO_BCACHEFS_FS
+ &bch_xattr_bcachefs_handler,
+#endif
NULL
};
+static const struct xattr_handler *bch_xattr_handler_map[] = {
+ [BCH_XATTR_INDEX_USER] = &bch_xattr_user_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_ACCESS] =
+ &posix_acl_access_xattr_handler,
+ [BCH_XATTR_INDEX_POSIX_ACL_DEFAULT] =
+ &posix_acl_default_xattr_handler,
+ [BCH_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
+ [BCH_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
+};
+
static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
{
return type < ARRAY_SIZE(bch_xattr_handler_map)
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/kernel.h>
-#include <linux/export.h>
+
+/*
+ * blk_status_t -> errno translation table (mirrors the kernel's
+ * block/blk-core.c), indexed directly by the blk_status_t value.
+ */
+static const struct {
+	int		err;
+	const char	*name;
+} blk_errors[] = {
+	[BLK_STS_OK]		= { 0,		"" },
+	[BLK_STS_NOTSUPP]	= { -EOPNOTSUPP, "operation not supported" },
+	[BLK_STS_TIMEOUT]	= { -ETIMEDOUT,	"timeout" },
+	[BLK_STS_NOSPC]		= { -ENOSPC,	"critical space allocation" },
+	[BLK_STS_TRANSPORT]	= { -ENOLINK,	"recoverable transport" },
+	[BLK_STS_TARGET]	= { -EREMOTEIO,	"critical target" },
+	[BLK_STS_NEXUS]		= { -EBADE,	"critical nexus" },
+	[BLK_STS_MEDIUM]	= { -ENODATA,	"critical medium" },
+	[BLK_STS_PROTECTION]	= { -EILSEQ,	"protection" },
+	[BLK_STS_RESOURCE]	= { -ENOMEM,	"kernel resource" },
+	[BLK_STS_AGAIN]		= { -EAGAIN,	"nonblocking retry" },
+
+	/* device mapper special case, should not leak out: */
+	[BLK_STS_DM_REQUEUE]	= { -EREMCHG, "dm internal retry" },
+
+	/* everything else not covered above: */
+	[BLK_STS_IOERR]		= { -EIO,	"I/O" },
+};
+
+/* Map a blk_status_t to a negative errno; unknown statuses become -EIO. */
+int blk_status_to_errno(blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+		return -EIO;
+	return blk_errors[idx].err;
+}
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
struct bio *parent = bio->bi_private;
- if (!parent->bi_error)
- parent->bi_error = bio->bi_error;
+ if (!parent->bi_status)
+ parent->bi_status = bio->bi_status;
bio_put(bio);
return parent;
}
bio->bi_end_io(bio);
}
-void bio_endio_nodec(struct bio *bio)
-{
- goto nodec;
-
- while (bio) {
- if (unlikely(!bio_remaining_done(bio)))
- break;
-nodec:
- if (bio->bi_end_io == bio_chain_endio) {
- struct bio *parent = bio->bi_private;
- parent->bi_error = bio->bi_error;
- bio_put(bio);
- bio = parent;
- } else {
- if (bio->bi_end_io)
- bio->bi_end_io(bio);
- bio = NULL;
- }
- }
-}
-
void bio_reset(struct bio *bio)
{
unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
ret = fdatasync(bio->bi_bdev->bd_fd);
if (ret) {
fprintf(stderr, "fsync error: %m\n");
- bio->bi_error = -EIO;
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
return;
}
submit_bio(bio);
wait_for_completion(&done);
- return bio->bi_error;
+ return blk_status_to_errno(bio->bi_status);
}
int blkdev_issue_discard(struct block_device *bdev,
for (ev = events; ev < events + ret; ev++) {
struct bio *bio = (struct bio *) ev->data;
- if (ev->res < 0)
- bio->bi_error = ev->res;
- else if (ev->res != bio->bi_iter.bi_size)
- bio->bi_error = -EIO;
+ if (ev->res != bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
}