-3b4024f94489e4d8dc8eb7f1278754a2545f8026
+f026e4e0243cc10e721504a8bfaa131ea8aa4c91
libbcachefs/dirent.c \
libbcachefs/error.c \
libbcachefs/extents.c \
- libbcachefs/fs-gc.c \
+ libbcachefs/fsck.c \
libbcachefs/inode.c \
libbcachefs/io.c \
libbcachefs/journal.c \
{
}
-extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
- struct bio *src, struct bvec_iter src_iter);
+extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+ struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
BCH_FS_BDEV_MOUNTED,
BCH_FS_ERROR,
BCH_FS_FSCK_FIXED_ERRORS,
+ BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
};
struct work_struct read_retry_work;
spinlock_t read_retry_lock;
+ /* ERRORS */
+ struct list_head fsck_errors;
+ struct mutex fsck_error_lock;
+ bool fsck_alloc_err;
+
/* FILESYSTEM */
wait_queue_head_t writeback_wait;
atomic_t writeback_pages;
ops->key_debugcheck(c, b, k);
}
-void bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+char *bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text)
ops->val_to_text(c, buf, size, k);
+
+ return buf;
}
-void bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
- char *buf, size_t size, struct bkey_s_c k)
+char *bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+ char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->val_to_text) {
- out += scnprintf(out, end - out, " -> ");
+ out += scnprintf(out, end - out, ": ");
ops->val_to_text(c, out, end - out, k);
}
+
+ return buf;
}
void bch2_bkey_swab(enum bkey_type type,
struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-void bch2_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
-void bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
- char *, size_t, struct bkey_s_c);
+char *bch2_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
+char *bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
+ char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
six_lock_init(&b->lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
+ INIT_LIST_HEAD(&b->reachable);
mca_data_alloc(c, b, gfp);
return b->data ? b : NULL;
bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
/* Repack everything with @new_format and sort down to one bset */
- for (i = 0; i < nr_old_nodes; i++)
+ for (i = 0; i < nr_old_nodes; i++) {
new_nodes[i] =
__bch2_btree_node_alloc_replacement(c, old_nodes[i],
new_format, res);
+ list_add(&new_nodes[i]->reachable, &as->reachable_list);
+ }
/*
* Conceptually we concatenate the nodes together and slice them
set_btree_bset_end(n1, n1->set);
+ list_del_init(&n2->reachable);
six_unlock_write(&n2->lock);
bch2_btree_node_free_never_inserted(c, n2);
six_unlock_intent(&n2->lock);
vstruct_end(i) - (void *) i->_data);
}
-#define btree_node_error(b, c, ptr, fmt, ...) \
- bch2_fs_inconsistent(c, \
- "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
- (b)->btree_id, (b)->level, btree_node_root(c, b) \
- ? btree_node_root(c, b)->level : -1, \
- PTR_BUCKET_NR(ca, ptr), (b)->written, \
- le16_to_cpu((i)->u64s), ##__VA_ARGS__)
-
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
- struct bch_dev *ca,
- const struct bch_extent_ptr *ptr,
- struct bset *i, unsigned sectors,
- unsigned *whiteout_u64s)
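+/*
+ * Expands in the caller's scope: relies on the locals @write, @i and @ret,
+ * and on an fsck_err label to jump to when an error can't be fixed:
+ */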
+#define btree_node_error(c, b, ptr, msg, ...) \
+do { \
+ if (write == READ && \
+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
+ mustfix_fsck_err(c, \
+ "btree node read error at btree %u level %u/%u\n"\
+ "sector %llu node offset %u bset u64s %u: " msg,\
+ (b)->btree_id, (b)->level, \
+ (c)->btree_roots[(b)->btree_id].level, \
+ (u64) ptr->offset, (b)->written, \
+ le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+ } else { \
+ bch_err(c, "%s at btree %u level %u/%u\n" \
+ "sector %llu node offset %u bset u64s %u: " msg,\
+ write == WRITE \
+ ? "corrupt metadata in btree node write" \
+ : "btree node error", \
+ (b)->btree_id, (b)->level, \
+ (c)->btree_roots[(b)->btree_id].level, \
+ (u64) ptr->offset, (b)->written, \
+ le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+} while (0)
+
+static int validate_bset(struct bch_fs *c, struct btree *b,
+ const struct bch_extent_ptr *ptr,
+ struct bset *i, unsigned sectors,
+ unsigned *whiteout_u64s,
+ int write)
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
bool seen_non_whiteout = false;
+ int ret = 0;
- if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
- return "unsupported bset version";
+ if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
+ btree_node_error(c, b, ptr, "unsupported bset version");
+ i->u64s = 0;
+ return 0;
+ }
- if (b->written + sectors > c->sb.btree_node_size)
- return "bset past end of btree node";
+ if (b->written + sectors > c->sb.btree_node_size) {
+ btree_node_error(c, b, ptr, "bset past end of btree node");
+ i->u64s = 0;
+ return 0;
+ }
- if (i != &b->data->keys && !i->u64s)
- btree_node_error(b, c, ptr, "empty set");
+ if (b->written && !i->u64s)
+ btree_node_error(c, b, ptr, "empty set");
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
const char *invalid;
if (!k->u64s) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"KEY_U64s 0: %zu bytes of metadata lost",
vstruct_end(i) - (void *) k);
}
if (bkey_next(k) > vstruct_last(i)) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"key extends past end of bset");
i->u64s = cpu_to_le16((u64 *) k - i->_data);
}
if (k->format > KEY_FORMAT_CURRENT) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"invalid bkey format %u", k->format);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
char buf[160];
bch2_bkey_val_to_text(c, btree_node_type(b),
- buf, sizeof(buf), u);
- btree_node_error(b, c, ptr,
+ buf, sizeof(buf), u);
+ btree_node_error(c, b, ptr,
"invalid bkey %s: %s", buf, invalid);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
- btree_node_error(b, c, ptr,
+ btree_node_error(c, b, ptr,
"keys out of order: %llu:%llu > %llu:%llu",
prev_pos.inode,
prev_pos.offset,
}
SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
- return NULL;
+fsck_err:
+ return ret;
}
static bool extent_contains_ptr(struct bkey_s_c_extent e,
const char *err;
struct bch_csum csum;
struct nonce nonce;
- int ret;
+ int ret, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
__bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
sectors = vstruct_sectors(bne, c->block_bits);
}
- err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
- if (err)
- goto err;
+ ret = validate_bset(c, b, ptr, i, sectors,
+ &whiteout_u64s, READ);
+ if (ret)
+ goto fsck_err;
b->written += sectors;
mempool_free(iter, &c->fill_iter);
return;
err:
+ btree_node_error(c, b, ptr, "%s", err);
+fsck_err:
+ bch2_inconsistent_error(c);
set_btree_node_read_error(b);
- btree_node_error(b, c, ptr, "%s", err);
goto out;
}
}
}
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+ struct bset *i, unsigned sectors)
+{
+ const struct bch_extent_ptr *ptr;
+ unsigned whiteout_u64s = 0;
+ int ret;
+
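+	/* any pointer will do; validate_bset() only uses it in error messages: */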
+ extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+ break;
+
+ ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+ if (ret)
+ bch2_fatal_error(c);
+
+ return ret;
+}
+
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
struct closure *parent,
enum six_lock_type lock_type_held)
if (!(old & (1 << BTREE_NODE_dirty)))
return;
+ if (b->written &&
+ !btree_node_may_write(b))
+ return;
+
if (old & (1 << BTREE_NODE_write_in_flight)) {
btree_node_wait_on_io(b);
continue;
}
new &= ~(1 << BTREE_NODE_dirty);
+ new &= ~(1 << BTREE_NODE_need_write);
new |= (1 << BTREE_NODE_write_in_flight);
new |= (1 << BTREE_NODE_just_written);
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(!list_empty_careful(&b->reachable) != !b->written);
BUG_ON(b->written >= c->sb.btree_node_size);
BUG_ON(bset_written(b, btree_bset_last(b)));
clear_needs_whiteout(i);
- if (b->written && !i->u64s) {
- /* Nothing to write: */
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
+ /* do we have data to write? */
+ if (b->written && !i->u64s)
+ goto nowrite;
+
+ bytes_to_write = vstruct_end(i) - data;
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+ memset(data + bytes_to_write, 0,
+ (sectors_to_write << 9) - bytes_to_write);
+ BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
BUG_ON(i->seq != b->data->keys.seq);
nonce = btree_nonce(b, i, b->written << 9);
+ /* if we're going to be encrypting, check metadata validity first: */
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
+
if (bn) {
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
&bn->flags,
bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
}
- bytes_to_write = vstruct_end(i) - data;
- sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
- memset(data + bytes_to_write, 0,
- (sectors_to_write << 9) - bytes_to_write);
-
- BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
- trace_btree_write(b, bytes_to_write, sectors_to_write);
+ /* if we're not encrypting, check metadata after checksumming: */
+ if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+ validate_bset_for_write(c, b, i, sectors_to_write))
+ goto err;
/*
* We handle btree write errors by immediately halting the journal -
* break:
*/
if (bch2_journal_error(&c->journal) ||
- c->opts.nochanges) {
- set_btree_node_noevict(b);
- b->written += sectors_to_write;
+ c->opts.nochanges)
+ goto err;
- btree_bounce_free(c, order, used_mempool, data);
- btree_node_write_done(c, b);
- return;
- }
+ trace_btree_write(b, bytes_to_write, sectors_to_write);
bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
b->written += sectors_to_write;
bch2_submit_wbio_replicas(wbio, c, &k.key);
+ return;
+err:
+ set_btree_node_noevict(b);
+ b->written += sectors_to_write;
+nowrite:
+ btree_bounce_free(c, order, used_mempool, data);
+ btree_node_write_done(c, b);
}
/*
static inline bool btree_node_may_write(struct btree *b)
{
- return list_empty_careful(&b->write_blocked);
+ return list_empty_careful(&b->write_blocked) &&
+ list_empty_careful(&b->reachable);
}
enum compact_mode {
#define bch2_btree_node_write_dirty(_c, _b, _cl, cond) \
do { \
while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
+ set_btree_node_need_write(_b); \
+ \
if (!btree_node_may_write(_b)) \
break; \
\
prefetch(c->btree_roots[btree_id].b);
}
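+/* Remove @iter from the list of iterators it is linked with: */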
+void bch2_btree_iter_unlink(struct btree_iter *iter)
+{
+ struct btree_iter *linked;
+
+ __bch2_btree_iter_unlock(iter);
+
+ if (!btree_iter_linked(iter))
+ return;
+
+	for_each_linked_btree_iter(iter, linked) {
+		if (linked->next == iter) {
+ linked->next = iter->next;
+ return;
+ }
+ }
+
+ BUG();
+}
+
void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
{
BUG_ON(btree_iter_linked(new));
void bch2_btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
{
- bch2_btree_iter_unlock(dst);
+ __bch2_btree_iter_unlock(dst);
memcpy(dst, src, offsetof(struct btree_iter, next));
dst->nodes_locked = dst->nodes_intent_locked = 0;
}
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
+void bch2_btree_iter_unlink(struct btree_iter *);
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
static inline struct bpos btree_type_successor(enum btree_id id,
*/
struct list_head write_blocked;
+ /*
+ * Also for asynchronous splits/interior node updates:
+ * If a btree node isn't reachable yet, we don't want to kick off
+ * another write - because that write also won't yet be reachable and
+ * marking it as completed before it's reachable would be incorrect:
+ */
+ struct list_head reachable;
+
struct open_bucket *ob;
/* lru list */
BTREE_NODE_read_error,
BTREE_NODE_write_error,
BTREE_NODE_dirty,
+ BTREE_NODE_need_write,
BTREE_NODE_noevict,
BTREE_NODE_write_idx,
BTREE_NODE_accessed,
BTREE_FLAG(read_error);
BTREE_FLAG(write_error);
BTREE_FLAG(dirty);
+BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
trace_btree_node_free(c, b);
BUG_ON(btree_node_dirty(b));
+ BUG_ON(btree_node_need_write(b));
BUG_ON(b == btree_node_root(c, b));
BUG_ON(b->ob);
BUG_ON(!list_empty(&b->write_blocked));
+ BUG_ON(!list_empty(&b->reachable));
clear_btree_node_noevict(b);
unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes;
return __bch2_btree_reserve_get(c, nr_nodes, flags, cl);
-
}
int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id,
struct closure cl;
struct btree_reserve *reserve;
struct btree *b;
+ LIST_HEAD(reachable_list);
closure_init_stack(&cl);
}
b = __btree_root_alloc(c, 0, id, reserve);
+ list_add(&b->reachable, &reachable_list);
bch2_btree_node_write(c, b, writes, SIX_LOCK_intent);
bch2_btree_set_root_initial(c, b, reserve);
bch2_btree_open_bucket_put(c, b);
+
+ list_del_init(&b->reachable);
six_unlock_intent(&b->lock);
bch2_btree_reserve_put(c, reserve);
bch2_btree_bset_insert_key(iter, b, node_iter, insert);
set_btree_node_dirty(b);
+ set_btree_node_need_write(b);
}
/* Inserting into a given leaf node (last stage of insert): */
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
- /*
- * have a bug where we're seeing an extent with an invalid crc
- * entry in the journal, trying to track it down:
- */
- BUG_ON(bch2_bkey_invalid(c, b->btree_id, bkey_i_to_s_c(insert)));
-
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
closure_init(&as->cl, &c->cl);
as->c = c;
as->mode = BTREE_INTERIOR_NO_UPDATE;
+ INIT_LIST_HEAD(&as->write_blocked_list);
+ INIT_LIST_HEAD(&as->reachable_list);
bch2_keylist_init(&as->parent_keys, as->inline_keys,
ARRAY_SIZE(as->inline_keys));
mutex_lock(&c->btree_interior_update_lock);
+ while (!list_empty(&as->reachable_list)) {
+ struct btree *b = list_first_entry(&as->reachable_list,
+ struct btree, reachable);
+ list_del_init(&b->reachable);
+ mutex_unlock(&c->btree_interior_update_lock);
+
+ six_lock_read(&b->lock);
+ bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b));
+ six_unlock_read(&b->lock);
+ mutex_lock(&c->btree_interior_update_lock);
+ }
+
for (i = 0; i < as->nr_pending; i++)
bch2_btree_node_free_ondisk(c, &as->pending[i]);
as->nr_pending = 0;
if (bch2_journal_error(&c->journal)) {
/* XXX what? */
+ /* we don't want to free the nodes on disk, that's what */
}
/* XXX: missing error handling, damnit */
list_del(&as->write_blocked_list);
mutex_unlock(&c->btree_interior_update_lock);
- bch2_btree_node_write_dirty(c, b, NULL, true);
+ bch2_btree_node_write_dirty(c, b, NULL,
+ btree_node_need_write(b));
six_unlock_read(&b->lock);
break;
}
clear_btree_node_dirty(b);
+ clear_btree_node_need_write(b);
w = btree_current_write(b);
llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
&as->journal, interior_update_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
+ if (!list_empty(&b->reachable))
+ list_del_init(&b->reachable);
mutex_unlock(&c->btree_interior_update_lock);
}
* node)
*/
static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1,
- struct btree_reserve *reserve)
+ struct btree_reserve *reserve,
+ struct btree_interior_update *as)
{
size_t nr_packed = 0, nr_unpacked = 0;
struct btree *n2;
struct bkey_packed *k, *prev = NULL;
n2 = bch2_btree_node_alloc(iter->c, n1->level, iter->btree_id, reserve);
+ list_add(&n2->reachable, &as->reachable_list);
+
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
n2->key.k.p = n1->key.k.p;
bch2_btree_interior_update_will_free_node(c, as, b);
n1 = bch2_btree_node_alloc_replacement(c, b, reserve);
+ list_add(&n1->reachable, &as->reachable_list);
+
if (b->level)
btree_split_insert_keys(iter, n1, insert_keys, reserve);
if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_node_split(c, b, b->nr.live_u64s);
- n2 = __btree_split_node(iter, n1, reserve);
+ n2 = __btree_split_node(iter, n1, reserve, as);
bch2_btree_build_aux_trees(n2);
bch2_btree_build_aux_trees(n1);
n3 = __btree_root_alloc(c, b->level + 1,
iter->btree_id,
reserve);
+ list_add(&n3->reachable, &as->reachable_list);
+
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
bch2_btree_interior_update_will_free_node(c, as, m);
n = bch2_btree_node_alloc(c, b->level, b->btree_id, reserve);
+ list_add(&n->reachable, &as->reachable_list);
+
n->data->min_key = prev->data->min_key;
n->data->max_key = next->data->max_key;
n->data->format = new_f;
int ret;
trans_for_each_entry(trans, i) {
- EBUG_ON(i->iter->level);
- EBUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+ BUG_ON(i->iter->level);
+ BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
}
sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
goto out;
}
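+/* Delete the key at @iter's current position: */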
+int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_i k;
+
+ bkey_init(&k.k);
+ k.k.p = iter->pos;
+
+ return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|flags,
+ BTREE_INSERT_ENTRY(iter, &k));
+}
+
int bch2_btree_insert_list_at(struct btree_iter *iter,
struct keylist *keys,
struct disk_reservation *disk_res,
return 0;
}
-/**
- * bch_btree_insert_check_key - insert dummy key into btree
- *
- * We insert a random key on a cache miss, then compare exchange on it
- * once the cache promotion or backing device read completes. This
- * ensures that if this key is written to after the read, the read will
- * lose and not overwrite the key with stale data.
- *
- * Return values:
- * -EAGAIN: @iter->cl was put on a waitlist waiting for btree node allocation
- * -EINTR: btree node was changed while upgrading to write lock
- */
-int bch2_btree_insert_check_key(struct btree_iter *iter,
- struct bkey_i *check_key)
-{
- struct bpos saved_pos = iter->pos;
- struct bkey_i_cookie *cookie;
- BKEY_PADDED(key) tmp;
- int ret;
-
- BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&check_key->k)));
-
- check_key->k.type = KEY_TYPE_COOKIE;
- set_bkey_val_bytes(&check_key->k, sizeof(struct bch_cookie));
-
- cookie = bkey_i_to_cookie(check_key);
- get_random_bytes(&cookie->v, sizeof(cookie->v));
-
- bkey_copy(&tmp.key, check_key);
-
- ret = bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(iter, &tmp.key));
-
- bch2_btree_iter_rewind(iter, saved_pos);
-
- return ret;
-}
-
/**
* bch_btree_insert - insert keys into the extent btree
* @c: pointer to struct bch_fs
bch2_btree_interior_update_will_free_node(c, as, b);
n = bch2_btree_node_alloc_replacement(c, b, reserve);
+ list_add(&n->reachable, &as->reachable_list);
bch2_btree_build_aux_trees(n);
six_unlock_write(&n->lock);
*/
struct btree_interior_update {
struct closure cl;
- struct bch_fs *c;
+ struct bch_fs *c;
struct list_head list;
*/
struct btree *b;
struct list_head write_blocked_list;
+ struct list_head reachable_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
int __bch2_btree_insert_at(struct btree_insert *);
-
#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...) N
#define COUNT_ARGS(...) _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
*/
#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3)
+int bch2_btree_delete_at(struct btree_iter *, unsigned);
+
int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, unsigned);
return u64s <= trans->journal_res.u64s;
}
-int bch2_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, int flags);
#include "debug.h"
#include "error.h"
#include "extents.h"
-#include "fs-gc.h"
+#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "super.h"
return len;
}
+static unsigned dirent_val_u64s(unsigned len)
+{
+ return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
+}
+
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
const struct qstr *name)
{
return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}
-static const struct bch_hash_desc dirent_hash_desc = {
+const struct bch_hash_desc bch2_dirent_hash_desc = {
.btree_id = BTREE_ID_DIRENTS,
.key_type = BCH_DIRENT,
.whiteout_type = BCH_DIRENT_WHITEOUT,
static const char *bch2_dirent_invalid(const struct bch_fs *c,
struct bkey_s_c k)
{
+ struct bkey_s_c_dirent d;
+ unsigned len;
+
switch (k.k->type) {
case BCH_DIRENT:
- return bkey_val_bytes(k.k) < sizeof(struct bch_dirent)
- ? "value too small"
- : NULL;
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
+ return "value too small";
+
+ d = bkey_s_c_to_dirent(k);
+ len = bch2_dirent_name_bytes(d);
+
+ if (!len)
+ return "empty name";
+
+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
+ return "value too big";
+
+ if (len > NAME_MAX)
+ return "dirent name too big";
+ if (memchr(d.v->d_name, '/', len))
+ return "dirent name has invalid characters";
+
+ return NULL;
case BCH_DIRENT_WHITEOUT:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
size_t size, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
+ size_t n = 0;
switch (k.k->type) {
case BCH_DIRENT:
d = bkey_s_c_to_dirent(k);
- if (size) {
- unsigned n = min_t(unsigned, size,
- bch2_dirent_name_bytes(d));
- memcpy(buf, d.v->d_name, n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
- scnprintf(buf, size, " -> %llu", d.v->d_inum);
+ n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
+ bch2_dirent_name_bytes(d));
+ n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
break;
case BCH_DIRENT_WHITEOUT:
scnprintf(buf, size, "whiteout");
const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s +
- DIV_ROUND_UP(sizeof(struct bch_dirent) + name->len,
- sizeof(u64));
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
if (!dirent)
if (!dirent)
return -ENOMEM;
- ret = bch2_hash_set(dirent_hash_desc, hash_info, c, dir_inum,
+ ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
journal_seq, &dirent->k_i, flags);
kfree(dirent);
* from the original hashed position (like we do when creating dirents,
* in bch_hash_set) - we never move existing dirents to different slot:
*/
- old_src = bch2_hash_lookup_at(dirent_hash_desc,
+ old_src = bch2_hash_lookup_at(bch2_dirent_hash_desc,
&src_ei->str_hash,
&src_iter, src_name);
if ((ret = btree_iter_err(old_src)))
goto err;
- ret = bch2_hash_needs_whiteout(dirent_hash_desc,
+ ret = bch2_hash_needs_whiteout(bch2_dirent_hash_desc,
&src_ei->str_hash,
&whiteout_iter, &src_iter);
if (ret < 0)
* to do that check for us for correctness:
*/
old_dst = mode == BCH_RENAME
- ? bch2_hash_hole_at(dirent_hash_desc, &dst_iter)
- : bch2_hash_lookup_at(dirent_hash_desc,
+ ? bch2_hash_hole_at(bch2_dirent_hash_desc, &dst_iter)
+ : bch2_hash_lookup_at(bch2_dirent_hash_desc,
&dst_ei->str_hash,
&dst_iter, dst_name);
if ((ret = btree_iter_err(old_dst)))
const struct qstr *name,
u64 *journal_seq)
{
- return bch2_hash_delete(dirent_hash_desc, hash_info,
+ return bch2_hash_delete(bch2_dirent_hash_desc, hash_info,
c, dir_inum, journal_seq, name);
}
struct bkey_s_c k;
u64 inum;
- k = bch2_hash_lookup(dirent_hash_desc, hash_info, c,
+ k = bch2_hash_lookup(bch2_dirent_hash_desc, hash_info, c,
dir_inum, &iter, name);
if (IS_ERR(k.k)) {
bch2_btree_iter_unlock(&iter);
#ifndef _BCACHE_DIRENT_H
#define _BCACHE_DIRENT_H
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
extern const struct bkey_ops bch2_bkey_dirent_ops;
struct qstr;
{
queue_work(system_long_wq, &ca->io_error_work);
}
+
+#ifdef __KERNEL__
+#define ask_yn() false
+#else
+#include "tools-util.h"
+#endif
+
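+/*
+ * Report an inconsistency found by fsck: errors are ratelimited per format
+ * string (only the first 10 instances are printed), and the return value
+ * tells the caller whether to fix the error, ignore it, or give up:
+ */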
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
+ const char *fmt, ...)
+{
+ struct fsck_err_state *s;
+ va_list args;
+ bool fix = false, print = true, suppressing = false;
+ char _buf[sizeof(s->buf)], *buf = _buf;
+
+ mutex_lock(&c->fsck_error_lock);
+
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ goto print;
+
+ list_for_each_entry(s, &c->fsck_errors, list)
+ if (s->fmt == fmt)
+ goto found;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s) {
+ if (!c->fsck_alloc_err)
+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
+ c->fsck_alloc_err = true;
+ buf = _buf;
+ goto print;
+ }
+
+ INIT_LIST_HEAD(&s->list);
+ s->fmt = fmt;
+found:
+ list_move(&s->list, &c->fsck_errors);
+ s->nr++;
+ suppressing = s->nr == 10;
+ print = s->nr <= 10;
+ buf = s->buf;
+print:
+ va_start(args, fmt);
+ vscnprintf(buf, sizeof(_buf), fmt, args);
+ va_end(args);
+
+ if (flags & FSCK_CAN_FIX) {
+ if (c->opts.fix_errors == FSCK_ERR_ASK) {
+ printk(KERN_ERR "%s: fix?", buf);
+ fix = ask_yn();
+ } else if (c->opts.fix_errors == FSCK_ERR_YES ||
+ (c->opts.nochanges &&
+ !(flags & FSCK_CAN_IGNORE))) {
+ if (print)
+ bch_err(c, "%s, fixing", buf);
+ fix = true;
+ } else {
+ if (print)
+ bch_err(c, "%s, not fixing", buf);
+ fix = false;
+ }
+ } else if (flags & FSCK_NEED_FSCK) {
+ if (print)
+ bch_err(c, "%s (run fsck to correct)", buf);
+ } else {
+ if (print)
+ bch_err(c, "%s (repair unimplemented)", buf);
+ }
+
+ if (suppressing)
+ bch_err(c, "Ratelimiting new instances of previous error");
+
+ mutex_unlock(&c->fsck_error_lock);
+
+ if (fix)
+ set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
+
+ return fix ? FSCK_ERR_FIX
+ : flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
+ : FSCK_ERR_EXIT;
+}
+
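+/* Called when fsck completes: summarize suppressed errors and free the state: */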
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+ struct fsck_err_state *s, *n;
+
+ mutex_lock(&c->fsck_error_lock);
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+ if (s->nr > 10)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
+
+ list_del(&s->list);
+ kfree(s);
+ }
+
+ mutex_unlock(&c->fsck_error_lock);
+}
BCH_FSCK_UNKNOWN_VERSION = 4,
};
-/* These macros return true if error should be fixed: */
-
-/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
-
enum fsck_err_opts {
FSCK_ERR_NO,
FSCK_ERR_YES,
FSCK_ERR_ASK,
};
-#ifdef __KERNEL__
-#define __fsck_err_should_fix(c, msg, ...) \
-({ \
- bool _fix = (c)->opts.fix_errors; \
- bch_err(c, msg ", %sfixing", ##__VA_ARGS__, _fix ? "" : "not ");\
- _fix; \
-})
-#else
-#include "tools-util.h"
+enum fsck_err_ret {
+ FSCK_ERR_IGNORE = 0,
+ FSCK_ERR_FIX = 1,
+ FSCK_ERR_EXIT = 2,
+};
-#define __fsck_err_should_fix(c, msg, ...) \
-({ \
- bool _fix = false; \
- switch ((c)->opts.fix_errors) { \
- case FSCK_ERR_ASK: \
- printf(msg ": fix?", ##__VA_ARGS__); \
- _fix = ask_yn(); \
- break; \
- case FSCK_ERR_YES: \
- bch_err(c, msg ", fixing", ##__VA_ARGS__); \
- _fix = true; \
- break; \
- case FSCK_ERR_NO: \
- bch_err(c, msg, ##__VA_ARGS__); \
- _fix = false; \
- break; \
- } \
- _fix; \
-})
-#endif
+struct fsck_err_state {
+ struct list_head list;
+ const char *fmt;
+ u64 nr;
+ char buf[512];
+};
+
+#define FSCK_CAN_FIX (1 << 0)
+#define FSCK_CAN_IGNORE (1 << 1)
+#define FSCK_NEED_FSCK (1 << 2)
-#define __fsck_err(c, _can_fix, _can_ignore, _nofix_msg, msg, ...) \
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
+ unsigned, const char *, ...);
+void bch2_flush_fsck_errs(struct bch_fs *);
+
+#define __fsck_err(c, _flags, msg, ...) \
({ \
- bool _fix; \
- \
- if (_can_fix) { \
- _fix = __fsck_err_should_fix(c, msg, ##__VA_ARGS__); \
- } else { \
- bch_err(c, msg " ("_nofix_msg")", ##__VA_ARGS__); \
- _fix = false; \
- } \
+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
\
- if (_fix) \
- set_bit(BCH_FS_FSCK_FIXED_ERRORS, &(c)->flags); \
- \
- if (!_fix && !_can_ignore) { \
+ if (_fix == FSCK_ERR_EXIT) { \
bch_err(c, "Unable to continue, halting"); \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
_fix; \
})
-#define __fsck_err_on(cond, c, _can_fix, _can_ignore, _nofix_msg, ...) \
- ((cond) ? __fsck_err(c, _can_fix, _can_ignore, \
- _nofix_msg, ##__VA_ARGS__) : false)
+/* These macros return true if error should be fixed: */
+
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
+
+#define __fsck_err_on(cond, c, _flags, ...) \
+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define unfixable_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, false, true, "repair unimplemented", ##__VA_ARGS__)
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE, ##__VA_ARGS__)
#define need_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, false, true, "run fsck to correct", ##__VA_ARGS__)
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define mustfix_fsck_err(c, ...) \
- __fsck_err(c, true, false, "not fixing", ##__VA_ARGS__)
+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define mustfix_fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, true, false, "not fixing", ##__VA_ARGS__)
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define fsck_err_on(cond, c, ...) \
- __fsck_err_on(cond, c, true, true, "not fixing", ##__VA_ARGS__)
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
#include "clock.h"
#include "error.h"
#include "fs.h"
-#include "fs-gc.h"
#include "fs-io.h"
+#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "io.h"
#include "dirent.h"
#include "extents.h"
#include "fs.h"
-#include "fs-gc.h"
#include "fs-io.h"
+#include "fsck.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
#include "dirent.h"
#include "error.h"
#include "fs.h"
-#include "fs-gc.h"
+#include "fsck.h"
#include "inode.h"
#include "keylist.h"
#include "super.h"
+#include "xattr.h"
#include <linux/dcache.h> /* struct qstr */
#include <linux/generic-radix-tree.h>
bch2_btree_iter_unlock(iter);
ret = bch2_inode_find_by_inum(c, dir_inum, &dir_inode);
- if (ret)
+ if (ret) {
+ bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
goto err;
+ }
dir_hash_info = bch2_hash_info_init(c, &dir_inode);
ret = bch2_dirent_delete(c, dir_inum, &dir_hash_info, &name, NULL);
+ if (ret)
+ bch_err(c, "remove_dirent: err %i deleting dirent", ret);
err:
kfree(buf);
return ret;
return 0;
}
+struct hash_check {
+ struct bch_hash_info info;
+ struct btree_iter chain;
+ struct btree_iter iter;
+ u64 next;
+};
+
+static void hash_check_init(const struct bch_hash_desc desc,
+ struct hash_check *h, struct bch_fs *c)
+{
+ bch2_btree_iter_init(&h->chain, c, desc.btree_id, POS_MIN);
+ bch2_btree_iter_init(&h->iter, c, desc.btree_id, POS_MIN);
+}
+
+static void hash_check_set_inode(struct hash_check *h, struct bch_fs *c,
+ const struct bch_inode_unpacked *bi)
+{
+ h->info = bch2_hash_info_init(c, bi);
+ h->next = -1;
+}
+
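+/*
+ * A key was found at the wrong offset: delete it where it is, then reinsert
+ * it via the normal hashed-insert path so it lands in the correct slot:
+ */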
+static int hash_redo_key(const struct bch_hash_desc desc,
+ struct hash_check *h, struct bch_fs *c,
+ struct btree_iter *k_iter, struct bkey_s_c k,
+ u64 hashed)
+{
+ struct bkey_i *tmp;
+ int ret = 0;
+
+ tmp = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ bkey_reassemble(tmp, k);
+
+ ret = bch2_btree_delete_at(k_iter, 0);
+ if (ret)
+ goto err;
+
+ bch2_btree_iter_unlock(k_iter);
+
+ bch2_hash_set(desc, &h->info, c, k_iter->pos.inode, NULL,
+ tmp, BCH_HASH_SET_MUST_CREATE);
+err:
+ kfree(tmp);
+ return ret;
+}
+
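+/*
+ * Check one dirent/xattr key: h->chain tracks the start of the hash chain
+ * @k could belong to; a key hashing outside [chain start, key position] is
+ * at the wrong offset, and the rest of the chain is scanned for duplicates:
+ */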
+static int hash_check_key(const struct bch_hash_desc desc,
+ struct hash_check *h, struct bch_fs *c,
+ struct btree_iter *k_iter, struct bkey_s_c k)
+{
+ char buf[200];
+ u64 hashed;
+ int ret = 0;
+
+ if (k.k->type != desc.whiteout_type &&
+ k.k->type != desc.key_type)
+ return 0;
+
+ if (k.k->p.offset != h->next) {
+ if (!btree_iter_linked(&h->chain)) {
+ bch2_btree_iter_link(k_iter, &h->chain);
+ bch2_btree_iter_link(k_iter, &h->iter);
+ }
+ bch2_btree_iter_copy(&h->chain, k_iter);
+ }
+ h->next = k.k->p.offset + 1;
+
+ if (k.k->type != desc.key_type)
+ return 0;
+
+ hashed = desc.hash_bkey(&h->info, k);
+
+ if (fsck_err_on(hashed < h->chain.pos.offset ||
+ hashed > k.k->p.offset, c,
+ "hash table key at wrong offset: %llu, "
+			"hashed to %llu, chain starts at %llu\n%s",
+ k.k->p.offset, hashed, h->chain.pos.offset,
+ bch2_bkey_val_to_text(c, desc.btree_id,
+ buf, sizeof(buf), k))) {
+ ret = hash_redo_key(desc, h, c, k_iter, k, hashed);
+ if (ret) {
+ bch_err(c, "hash_redo_key err %i", ret);
+ return ret;
+ }
+ return 1;
+ }
+
+ if (!bkey_cmp(h->chain.pos, k_iter->pos))
+ return 0;
+
+ bch2_btree_iter_copy(&h->iter, &h->chain);
+ while (bkey_cmp(h->iter.pos, k_iter->pos) < 0) {
+ struct bkey_s_c k2 = bch2_btree_iter_peek(&h->iter);
+
+ if (fsck_err_on(k2.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, k2), c,
+ "duplicate hash table keys:\n%s",
+ bch2_bkey_val_to_text(c, desc.btree_id,
+ buf, sizeof(buf), k))) {
+ ret = bch2_hash_delete_at(desc, &h->info, &h->iter, NULL);
+ if (ret)
+ return ret;
+ return 1;
+ }
+ bch2_btree_iter_advance_pos(&h->iter);
+ }
+fsck_err:
+ return ret;
+}
+
/*
* Walk extents: verify that extents have a corresponding S_ISREG inode, and
* that i_size an i_sectors are consistent
if (ret)
break;
- unfixable_fsck_err_on(!w.have_inode, c,
+ if (fsck_err_on(!w.have_inode, c,
"extent type %u for missing inode %llu",
- k.k->type, k.k->p.inode);
-
- unfixable_fsck_err_on(w.have_inode &&
+ k.k->type, k.k->p.inode) ||
+ fsck_err_on(w.have_inode &&
!S_ISREG(w.inode.i_mode) && !S_ISLNK(w.inode.i_mode), c,
"extent type %u for non regular file, inode %llu mode %o",
- k.k->type, k.k->p.inode, w.inode.i_mode);
+ k.k->type, k.k->p.inode, w.inode.i_mode)) {
+ ret = bch2_btree_delete_at(&iter, 0);
+ if (ret)
+ goto err;
+ continue;
+ }
unfixable_fsck_err_on(w.first_this_inode &&
w.have_inode &&
"extent type %u offset %llu past end of inode %llu, i_size %llu",
k.k->type, k.k->p.offset, k.k->p.inode, w.inode.i_size);
}
+err:
fsck_err:
return bch2_btree_iter_unlock(&iter) ?: ret;
}
static int check_dirents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
+ struct hash_check h;
struct btree_iter iter;
struct bkey_s_c k;
+ unsigned name_len;
+ char buf[200];
int ret = 0;
+ hash_check_init(bch2_dirent_hash_desc, &h, c);
+
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
POS(BCACHE_ROOT_INO, 0), k) {
struct bkey_s_c_dirent d;
if (ret)
break;
- unfixable_fsck_err_on(!w.have_inode, c,
- "dirent in nonexisting directory %llu",
- k.k->p.inode);
+ if (fsck_err_on(!w.have_inode, c,
+ "dirent in nonexisting directory:\n%s",
+ bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+ buf, sizeof(buf), k)) ||
+ fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
+ "dirent in non directory inode type %u:\n%s",
+ mode_to_type(w.inode.i_mode),
+ bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+ buf, sizeof(buf), k))) {
+ ret = bch2_btree_delete_at(&iter, 0);
+ if (ret)
+ goto err;
+ continue;
+ }
+
+ if (w.first_this_inode && w.have_inode)
+ hash_check_set_inode(&h, c, &w.inode);
+
+ ret = hash_check_key(bch2_dirent_hash_desc, &h, c, &iter, k);
+ if (ret > 0) {
+ ret = 0;
+ continue;
+ }
- unfixable_fsck_err_on(!S_ISDIR(w.inode.i_mode), c,
- "dirent in non directory inode %llu, type %u",
- k.k->p.inode, mode_to_type(w.inode.i_mode));
+ if (ret)
+ goto fsck_err;
if (k.k->type != BCH_DIRENT)
continue;
d = bkey_s_c_to_dirent(k);
d_inum = le64_to_cpu(d.v->d_inum);
+ name_len = bch2_dirent_name_bytes(d);
+
+ if (fsck_err_on(!name_len, c, "empty dirent") ||
+ fsck_err_on(name_len == 1 &&
+ !memcmp(d.v->d_name, ".", 1), c,
+ ". dirent") ||
+ fsck_err_on(name_len == 2 &&
+ !memcmp(d.v->d_name, "..", 2), c,
+ ".. dirent")) {
+ ret = remove_dirent(c, &iter, d);
+ if (ret)
+ goto err;
+ continue;
+ }
+
if (fsck_err_on(d_inum == d.k->p.inode, c,
- "dirent points to own directory")) {
+ "dirent points to own directory:\n%s",
+ bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+ buf, sizeof(buf), k))) {
ret = remove_dirent(c, &iter, d);
if (ret)
goto err;
ret = 0;
if (fsck_err_on(!have_target, c,
- "dirent points to missing inode %llu, type %u filename %s",
- d_inum, d.v->d_type, d.v->d_name)) {
+ "dirent points to missing inode:\n%s",
+ bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+ buf, sizeof(buf), k))) {
ret = remove_dirent(c, &iter, d);
if (ret)
goto err;
if (fsck_err_on(have_target &&
d.v->d_type !=
mode_to_type(le16_to_cpu(target.i_mode)), c,
- "incorrect d_type: got %u should be %u, filename %s",
- d.v->d_type,
+ "incorrect d_type: should be %u:\n%s",
mode_to_type(le16_to_cpu(target.i_mode)),
- d.v->d_name)) {
+ bch2_bkey_val_to_text(c, BTREE_ID_DIRENTS,
+ buf, sizeof(buf), k))) {
struct bkey_i_dirent *n;
n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
}
err:
fsck_err:
+ bch2_btree_iter_unlock(&h.chain);
+ bch2_btree_iter_unlock(&h.iter);
return bch2_btree_iter_unlock(&iter) ?: ret;
}
static int check_xattrs(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
+ struct hash_check h;
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
+ hash_check_init(bch2_xattr_hash_desc, &h, c);
+
for_each_btree_key(&iter, c, BTREE_ID_XATTRS,
POS(BCACHE_ROOT_INO, 0), k) {
ret = walk_inode(c, &w, k.k->p.inode);
if (ret)
break;
- unfixable_fsck_err_on(!w.have_inode, c,
- "xattr for missing inode %llu",
- k.k->p.inode);
+ if (fsck_err_on(!w.have_inode, c,
+ "xattr for missing inode %llu",
+ k.k->p.inode)) {
+ ret = bch2_btree_delete_at(&iter, 0);
+ if (ret)
+ goto err;
+ continue;
+ }
+
+ if (w.first_this_inode && w.have_inode)
+ hash_check_set_inode(&h, c, &w.inode);
+
+ ret = hash_check_key(bch2_xattr_hash_desc, &h, c, &iter, k);
+ if (ret)
+ goto fsck_err;
}
+err:
fsck_err:
+ bch2_btree_iter_unlock(&h.chain);
+ bch2_btree_iter_unlock(&h.iter);
return bch2_btree_iter_unlock(&iter) ?: ret;
}
/* DFS: */
restart_dfs:
+ had_unreachable = false;
+
ret = inode_bitmap_set(&dirs_done, BCACHE_ROOT_INO);
if (ret)
goto err;
d_inum = le64_to_cpu(dirent.v->d_inum);
if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
- "directory with multiple hardlinks")) {
+ "directory %llu has multiple hardlinks",
+ d_inum)) {
ret = remove_dirent(c, &iter, dirent);
if (ret)
goto err;
path.nr--;
}
- had_unreachable = false;
-
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
if (k.k->type != BCH_INODE_FS ||
!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->i_mode)))
ret = bch2_inode_unpack(inode, &u);
if (bch2_fs_inconsistent_on(ret, c,
- "error unpacking inode %llu in fs-gc",
+ "error unpacking inode %llu in fsck",
inode.k->p.inode))
return ret;
struct bch_inode_unpacked root_inode, lostfound_inode;
int ret;
- ret = check_root(c, &root_inode);
- if (ret)
- return ret;
+ if (full_fsck) {
+ bch_verbose(c, "checking extents");
+ ret = check_extents(c);
+ if (ret)
+ return ret;
- ret = check_lostfound(c, &root_inode, &lostfound_inode);
- if (ret)
- return ret;
+ bch_verbose(c, "checking dirents");
+ ret = check_dirents(c);
+ if (ret)
+ return ret;
- if (!full_fsck)
- goto check_nlinks;
+ bch_verbose(c, "checking xattrs");
+ ret = check_xattrs(c);
+ if (ret)
+ return ret;
- ret = check_extents(c);
- if (ret)
- return ret;
+ bch_verbose(c, "checking root directory");
+ ret = check_root(c, &root_inode);
+ if (ret)
+ return ret;
- ret = check_dirents(c);
- if (ret)
- return ret;
+ bch_verbose(c, "checking lost+found");
+ ret = check_lostfound(c, &root_inode, &lostfound_inode);
+ if (ret)
+ return ret;
- ret = check_xattrs(c);
- if (ret)
- return ret;
+ bch_verbose(c, "checking directory structure");
+ ret = check_directory_structure(c, &lostfound_inode);
+ if (ret)
+ return ret;
- ret = check_directory_structure(c, &lostfound_inode);
- if (ret)
- return ret;
-check_nlinks:
- ret = check_inode_nlinks(c, &lostfound_inode);
- if (ret)
- return ret;
+ bch_verbose(c, "checking inode nlinks");
+ ret = check_inode_nlinks(c, &lostfound_inode);
+ if (ret)
+ return ret;
+ } else {
+ bch_verbose(c, "checking root directory");
+ ret = check_root(c, &root_inode);
+ if (ret)
+ return ret;
+
+ bch_verbose(c, "checking lost+found");
+ ret = check_lostfound(c, &root_inode, &lostfound_inode);
+ if (ret)
+ return ret;
+
+ bch_verbose(c, "checking inode nlinks");
+ ret = check_inode_nlinks(c, &lostfound_inode);
+ if (ret)
+ return ret;
+ }
+
+ bch2_flush_fsck_errs(c);
return 0;
}
13 * 8 - 8,
};
-static int inode_encode_field(u8 *out, u8 *end, const u64 in[2])
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
{
- unsigned bytes, bits, shift;
-
- if (likely(!in[1]))
- bits = fls64(in[0]);
- else
- bits = fls64(in[1]) + 64;
+ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
+ unsigned shift, bytes, bits = likely(!hi)
+ ? fls64(lo)
+ : fls64(hi) + 64;
for (shift = 1; shift <= 8; shift++)
if (bits < bits_table[shift - 1])
BUG_ON(out + bytes > end);
- if (likely(bytes <= 8)) {
- u64 b = cpu_to_be64(in[0]);
-
- memcpy(out, (void *) &b + 8 - bytes, bytes);
- } else {
- u64 b = cpu_to_be64(in[1]);
-
- memcpy(out, (void *) &b + 16 - bytes, bytes);
- put_unaligned_be64(in[0], out + bytes - 8);
- }
-
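+	/* copy the low @bytes bytes of the 16-byte big-endian value: */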
+ memcpy(out, (u8 *) in + 16 - bytes, bytes);
*out |= (1 << 8) >> shift;
return bytes;
static int inode_decode_field(const u8 *in, const u8 *end,
u64 out[2], unsigned *out_bits)
{
- unsigned bytes, bits, shift;
+ __be64 be[2] = { 0, 0 };
+ unsigned bytes, shift;
+ u8 *p;
if (in >= end)
return -1;
*/
shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
bytes = byte_table[shift - 1];
- bits = bytes * 8 - shift;
if (in + bytes > end)
return -1;
- /*
- * we're assuming it's safe to deref up to 7 bytes < in; this will work
- * because keys always start quite a bit more than 7 bytes after the
- * start of the btree node header:
- */
- if (likely(bytes <= 8)) {
- out[0] = get_unaligned_be64(in + bytes - 8);
- out[0] <<= 64 - bits;
- out[0] >>= 64 - bits;
- out[1] = 0;
- } else {
- out[0] = get_unaligned_be64(in + bytes - 8);
- out[1] = get_unaligned_be64(in + bytes - 16);
- out[1] <<= 128 - bits;
- out[1] >>= 128 - bits;
- }
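+	/*
+	 * decode into the tail of a zeroed big-endian buffer, then strip
+	 * the length-prefix bit:
+	 */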
+ p = (u8 *) be + 16 - bytes;
+ memcpy(p, in, bytes);
+ *p ^= (1 << 8) >> shift;
+
+ out[0] = be64_to_cpu(be[0]);
+ out[1] = be64_to_cpu(be[1]);
+ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
- *out_bits = out[1] ? 64 + fls64(out[1]) : fls64(out[0]);
return bytes;
}
u8 *out = packed->inode.v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
- u64 field[2];
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
bkey_inode_init(&packed->inode.k_i);
packed->inode.v.i_mode = cpu_to_le16(inode->i_mode);
#define BCH_INODE_FIELD(_name, _bits) \
- field[0] = inode->_name; \
- field[1] = 0; \
- out += inode_encode_field(out, end, field); \
+ out += inode_encode_field(out, end, 0, inode->_name); \
nr_fields++; \
\
- if (field[0] | field[1]) { \
+ if (inode->_name) { \
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
}
if (field_bits > sizeof(unpacked->_name) * 8) \
return -1; \
\
- unpacked->_name = field[0]; \
+ unpacked->_name = field[1]; \
in += ret;
BCH_INODE_FIELDS()
bch2_btree_iter_unlock(&iter);
return -ENOENT;
}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void)
+{
+ struct bch_inode_unpacked *u, test_inodes[] = {
+ {
+ .i_atime = U64_MAX,
+ .i_ctime = U64_MAX,
+ .i_mtime = U64_MAX,
+ .i_otime = U64_MAX,
+ .i_size = U64_MAX,
+ .i_sectors = U64_MAX,
+ .i_uid = U32_MAX,
+ .i_gid = U32_MAX,
+ .i_nlink = U32_MAX,
+ .i_generation = U32_MAX,
+ .i_dev = U32_MAX,
+ },
+ };
+
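+	/* exercise packing of maximum-width field values: */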
+ for (u = test_inodes;
+ u < test_inodes + ARRAY_SIZE(test_inodes);
+ u++) {
+ struct bkey_inode_buf p;
+
+ bch2_inode_pack(&p, u);
+ }
+}
+#endif
return div_s64(ns, c->sb.time_precision);
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_inode_pack_test(void);
+#else
+static inline void bch2_inode_pack_test(void) {}
+#endif
+
#endif
bch2_encrypt_bio(c, rbio->crc.csum_type,
nonce, src);
- bio_copy_data_iter(dst, dst_iter,
- src, src->bi_iter);
+ bio_copy_data_iter(dst, &dst_iter,
+ src, &src->bi_iter);
} else {
bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
}
#define JOURNAL_ENTRY_NONE 6
#define JOURNAL_ENTRY_BAD 7
-static int journal_entry_validate(struct bch_fs *c,
- struct jset *j, u64 sector,
- unsigned bucket_sectors_left,
- unsigned sectors_read)
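+/*
+ * At read time an inconsistency is an fsck error we may be able to fix; just
+ * before a write it means we generated corrupt metadata, so the write is
+ * aborted (relies on @write, @ret and an fsck_err label in the caller):
+ */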
+#define journal_entry_err(c, msg, ...) \
+({ \
+ if (write == READ) { \
+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
+ } else { \
+ bch_err(c, "detected corrupt metadata before write:\n" \
+ msg, ##__VA_ARGS__); \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ true; \
+})
+
+#define journal_entry_err_on(cond, c, msg, ...) \
+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
+
+static int __journal_entry_validate(struct bch_fs *c, struct jset *j,
+ int write)
{
struct jset_entry *entry;
- size_t bytes = vstruct_bytes(j);
- struct bch_csum csum;
int ret = 0;
- if (le64_to_cpu(j->magic) != jset_magic(c))
- return JOURNAL_ENTRY_NONE;
-
- if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
- bch_err(c, "unknown journal entry version %u",
- le32_to_cpu(j->version));
- return BCH_FSCK_UNKNOWN_VERSION;
- }
-
- if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
- "journal entry too big (%zu bytes), sector %lluu",
- bytes, sector)) {
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- if (bytes > sectors_read << 9)
- return JOURNAL_ENTRY_REREAD;
-
- if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
- "journal entry with unknown csum type %llu sector %lluu",
- JSET_CSUM_TYPE(j), sector))
- return JOURNAL_ENTRY_BAD;
-
- csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
- if (mustfix_fsck_err_on(bch2_crc_cmp(csum, j->csum), c,
- "journal checksum bad, sector %llu", sector)) {
- /* XXX: retry IO, when we start retrying checksum errors */
- /* XXX: note we might have missing journal entries */
- return JOURNAL_ENTRY_BAD;
- }
-
- bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
- j->encrypted_start,
- vstruct_end(j) - (void *) j->encrypted_start);
-
- if (mustfix_fsck_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
- "invalid journal entry: last_seq > seq"))
- j->last_seq = j->seq;
-
vstruct_for_each(j, entry) {
struct bkey_i *k;
- if (mustfix_fsck_err_on(vstruct_next(entry) >
- vstruct_last(j), c,
- "journal entry extents past end of jset")) {
+ if (journal_entry_err_on(vstruct_next(entry) >
+ vstruct_last(j), c,
+ "journal entry extends past end of jset")) {
j->u64s = cpu_to_le64((u64 *) entry - j->_data);
break;
}
case JOURNAL_ENTRY_BTREE_ROOT:
k = entry->start;
- if (mustfix_fsck_err_on(!entry->u64s ||
+ if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s, c,
"invalid btree root journal entry: wrong number of keys")) {
journal_entry_null_range(entry,
break;
case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED:
- if (mustfix_fsck_err_on(le16_to_cpu(entry->u64s) != 1, c,
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry,
vstruct_next(entry));
break;
default:
- mustfix_fsck_err(c, "invalid journal entry type %llu",
+ journal_entry_err(c, "invalid journal entry type %llu",
JOURNAL_ENTRY_TYPE(entry));
journal_entry_null_range(entry, vstruct_next(entry));
break;
return ret;
}
+static int journal_entry_validate(struct bch_fs *c,
+ struct jset *j, u64 sector,
+ unsigned bucket_sectors_left,
+ unsigned sectors_read,
+ int write)
+{
+ size_t bytes = vstruct_bytes(j);
+ struct bch_csum csum;
+ int ret = 0;
+
+ if (le64_to_cpu(j->magic) != jset_magic(c))
+ return JOURNAL_ENTRY_NONE;
+
+ if (le32_to_cpu(j->version) != BCACHE_JSET_VERSION) {
+ bch_err(c, "unknown journal entry version %u",
+ le32_to_cpu(j->version));
+ return BCH_FSCK_UNKNOWN_VERSION;
+ }
+
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
+			"journal entry too big (%zu bytes), sector %llu",
+ bytes, sector)) {
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ if (bytes > sectors_read << 9)
+ return JOURNAL_ENTRY_REREAD;
+
+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)), c,
+			"journal entry with unknown csum type %llu, sector %llu",
+ JSET_CSUM_TYPE(j), sector))
+ return JOURNAL_ENTRY_BAD;
+
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ if (journal_entry_err_on(bch2_crc_cmp(csum, j->csum), c,
+ "journal checksum bad, sector %llu", sector)) {
+ /* XXX: retry IO, when we start retrying checksum errors */
+ /* XXX: note we might have missing journal entries */
+ return JOURNAL_ENTRY_BAD;
+ }
+
+ bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
+ j->encrypted_start,
+ vstruct_end(j) - (void *) j->encrypted_start);
+
+ if (journal_entry_err_on(le64_to_cpu(j->last_seq) > le64_to_cpu(j->seq), c,
+ "invalid journal entry: last_seq > seq"))
+ j->last_seq = j->seq;
+
+ return __journal_entry_validate(c, j, write);
+fsck_err:
+ return ret;
+}
+
struct journal_read_buf {
void *data;
size_t size;
}
ret = journal_entry_validate(c, j, offset,
- end - offset, sectors_read);
+ end - offset, sectors_read,
+ READ);
switch (ret) {
case BCH_FSCK_OK:
break;
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+ __journal_entry_validate(c, jset, WRITE))
+ goto err;
+
bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
+ if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) &&
+ __journal_entry_validate(c, jset, WRITE))
+ goto err;
+
sectors = vstruct_sectors(jset, c->block_bits);
BUG_ON(sectors > j->prev_buf_sectors);
ptr->offset += sectors;
closure_return_with_destructor(cl, journal_write_done);
+err:
+ bch2_fatal_error(c);
+ closure_return_with_destructor(cl, journal_write_done);
}
static void journal_write_work(struct work_struct *work)
#define _BCACHE_STR_HASH_H
#include "btree_iter.h"
+#include "btree_update.h"
#include "checksum.h"
+#include "error.h"
#include "inode.h"
#include "siphash.h"
#include "super.h"
return ret;
}
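+/*
+ * Delete the hash table key @iter points at, emitting a whiteout instead of
+ * a plain deletion when later keys in the chain still depend on this slot:
+ */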
+static inline int bch2_hash_delete_at(const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter,
+ u64 *journal_seq)
+{
+ struct btree_iter whiteout_iter;
+ struct bkey_i delete;
+ int ret = -ENOENT;
+
+ bch2_btree_iter_init(&whiteout_iter, iter->c, desc.btree_id,
+ iter->pos);
+ bch2_btree_iter_link(iter, &whiteout_iter);
+
+ ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, iter);
+ if (ret < 0)
+ goto err;
+
+ bkey_init(&delete.k);
+ delete.k.p = iter->pos;
+ delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
+
+ ret = bch2_btree_insert_at(iter->c, NULL, NULL, journal_seq,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_ENTRY(iter, &delete));
+err:
+ bch2_btree_iter_unlink(&whiteout_iter);
+ return ret;
+}
+
static inline int bch2_hash_delete(const struct bch_hash_desc desc,
const struct bch_hash_info *info,
struct bch_fs *c, u64 inode,
{
struct btree_iter iter, whiteout_iter;
struct bkey_s_c k;
- struct bkey_i delete;
int ret = -ENOENT;
bch2_btree_iter_init_intent(&iter, c, desc.btree_id,
if ((ret = btree_iter_err(k)))
goto err;
- ret = bch2_hash_needs_whiteout(desc, info, &whiteout_iter, &iter);
- if (ret < 0)
- goto err;
-
- bkey_init(&delete.k);
- delete.k.p = k.k->p;
- delete.k.type = ret ? desc.whiteout_type : KEY_TYPE_DELETED;
-
- ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_ATOMIC,
- BTREE_INSERT_ENTRY(&iter, &delete));
+ ret = bch2_hash_delete_at(desc, info, &iter, journal_seq);
err:
if (ret == -EINTR)
goto retry;
#include "debug.h"
#include "error.h"
#include "fs.h"
-#include "fs-gc.h"
+#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "journal.h"
INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
mutex_init(&c->zlib_workspace_lock);
+ INIT_LIST_HEAD(&c->fsck_errors);
+ mutex_init(&c->fsck_error_lock);
+
seqcount_init(&c->gc_pos_lock);
c->prio_clock[READ].hand = 1;
switch (ret) {
case BCH_FSCK_ERRORS_NOT_FIXED:
bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("mount with -o fix_errors to repair");
+ pr_cont("mount with -o fix_errors to repair\n");
err = "fsck error";
break;
case BCH_FSCK_REPAIR_UNIMPLEMENTED:
bch_err(c, "filesystem contains errors: please report this to the developers");
- pr_cont("repair unimplemented: inform the developers so that it can be added");
+ pr_cont("repair unimplemented: inform the developers so that it can be added\n");
err = "fsck error";
break;
case BCH_FSCK_REPAIR_IMPOSSIBLE:
kvpfree(ca->disk_buckets, bucket_bytes(ca));
kfree(ca->prio_buckets);
kfree(ca->bio_prio);
- vfree(ca->buckets);
- vfree(ca->oldest_gens);
+ kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket));
+ kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
free_heap(&ca->heap);
free_fifo(&ca->free_inc);
!init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&ca->free_inc, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&ca->heap, heap_size, GFP_KERNEL) ||
- !(ca->oldest_gens = vzalloc(sizeof(u8) *
- ca->mi.nbuckets)) ||
- !(ca->buckets = vzalloc(sizeof(struct bucket) *
- ca->mi.nbuckets)) ||
+ !(ca->oldest_gens = kvpmalloc(ca->mi.nbuckets *
+ sizeof(u8),
+ GFP_KERNEL|__GFP_ZERO)) ||
+ !(ca->buckets = kvpmalloc(ca->mi.nbuckets *
+ sizeof(struct bucket),
+ GFP_KERNEL|__GFP_ZERO)) ||
!(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
2, GFP_KERNEL)) ||
!(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
static int __init bcachefs_init(void)
{
bch2_bkey_pack_test();
+ bch2_inode_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
bch2_chardev_init() ||
{
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt;
- enum bch_opt_id id;
+ int id;
u64 v;
id = bch2_parse_sysfs_opt(attr->name, buf, &v);
dst += bv.bv_len;
}
}
+
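+/*
+ * Copy up to @len bytes from @src (which need not be NUL-terminated) into
+ * @buf, always NUL-terminating; returns the number of bytes copied:
+ */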
+size_t bch_scnmemcpy(char *buf, size_t size, const char *src, size_t len)
+{
+ size_t n;
+
+ if (!size)
+ return 0;
+
+ n = min(size - 1, len);
+ memcpy(buf, src, n);
+ buf[n] = '\0';
+
+ return n;
+}
static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
{
return size < PAGE_SIZE ? kmalloc(size, gfp_mask)
- : (void *) __get_free_pages(gfp_mask, get_order(size))
+ : (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+ get_order(size))
?: __vmalloc(size, gfp_mask, PAGE_KERNEL);
}
#define bio_for_each_contig_segment(bv, bio, iter) \
__bio_for_each_contig_segment(bv, bio, iter, (bio)->bi_iter)
+size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
+
#endif /* _BCACHE_UTIL_H */
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
+static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+{
+ return DIV_ROUND_UP(sizeof(struct bch_xattr) +
+ name_len + val_len, sizeof(u64));
+}
+
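+/* the xattr value is stored directly after the name: */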
+#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
+
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
+
struct xattr_search_key {
u8 type;
struct qstr name;
return bch2_str_hash_end(&ctx, info);
}
-#define xattr_val(_xattr) ((_xattr)->x_name + (_xattr)->x_name_len)
-
static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
{
return bch2_xattr_hash(info, key);
memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
}
-static const struct bch_hash_desc xattr_hash_desc = {
+const struct bch_hash_desc bch2_xattr_hash_desc = {
.btree_id = BTREE_ID_XATTRS,
.key_type = BCH_XATTR,
.whiteout_type = BCH_XATTR_WHITEOUT,
static const char *bch2_xattr_invalid(const struct bch_fs *c,
struct bkey_s_c k)
{
+ const struct xattr_handler *handler;
+ struct bkey_s_c_xattr xattr;
+ unsigned u64s;
+
switch (k.k->type) {
case BCH_XATTR:
- return bkey_val_bytes(k.k) < sizeof(struct bch_xattr)
- ? "value too small"
- : NULL;
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
+ return "value too small";
+ xattr = bkey_s_c_to_xattr(k);
+ u64s = xattr_val_u64s(xattr.v->x_name_len,
+ le16_to_cpu(xattr.v->x_val_len));
+
+ if (bkey_val_u64s(k.k) < u64s)
+ return "value too small";
+
+ if (bkey_val_u64s(k.k) > u64s)
+ return "value too big";
+
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+ if (!handler)
+ return "invalid type";
+
+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
+ return "xattr name has invalid characters";
+
+ return NULL;
case BCH_XATTR_WHITEOUT:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
+ const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
- int n;
+ size_t n = 0;
switch (k.k->type) {
case BCH_XATTR:
xattr = bkey_s_c_to_xattr(k);
- if (size) {
- n = min_t(unsigned, size, xattr.v->x_name_len);
- memcpy(buf, xattr.v->x_name, n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
- n = scnprintf(buf, size, " -> ");
- buf += n;
- size -= n;
-
- if (size) {
- n = min_t(unsigned, size,
- le16_to_cpu(xattr.v->x_val_len));
- memcpy(buf, xattr_val(xattr.v), n);
- buf[size - 1] = '\0';
- buf += n;
- size -= n;
- }
-
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
+ if (handler && handler->prefix)
+ n += scnprintf(buf + n, size - n, "%s", handler->prefix);
+ else if (handler)
+ n += scnprintf(buf + n, size - n, "(type %u)",
+ xattr.v->x_type);
+ else
+ n += scnprintf(buf + n, size - n, "(unknown type %u)",
+ xattr.v->x_type);
+
+ n += bch_scnmemcpy(buf + n, size - n, xattr.v->x_name,
+ xattr.v->x_name_len);
+ n += scnprintf(buf + n, size - n, ":");
+ n += bch_scnmemcpy(buf + n, size - n, xattr_val(xattr.v),
+ le16_to_cpu(xattr.v->x_val_len));
break;
case BCH_XATTR_WHITEOUT:
scnprintf(buf, size, "whiteout");
struct bkey_s_c_xattr xattr;
int ret;
- k = bch2_hash_lookup(xattr_hash_desc, &ei->str_hash, c,
+ k = bch2_hash_lookup(bch2_xattr_hash_desc, &ei->str_hash, c,
ei->vfs_inode.i_ino, &iter,
&X_SEARCH(type, name, strlen(name)));
if (IS_ERR(k.k))
int ret;
if (!value) {
- ret = bch2_hash_delete(xattr_hash_desc, hash_info,
+ ret = bch2_hash_delete(bch2_xattr_hash_desc, hash_info,
c, inum,
journal_seq, &search);
} else {
struct bkey_i_xattr *xattr;
unsigned u64s = BKEY_U64s +
- DIV_ROUND_UP(sizeof(struct bch_xattr) +
- search.name.len + size,
- sizeof(u64));
+ xattr_val_u64s(search.name.len, size);
if (u64s > U8_MAX)
return -ERANGE;
memcpy(xattr->v.x_name, search.name.name, search.name.len);
memcpy(xattr_val(&xattr->v), value, size);
- ret = bch2_hash_set(xattr_hash_desc, hash_info, c,
+ ret = bch2_hash_set(bch2_xattr_hash_desc, hash_info, c,
inum, journal_seq,
&xattr->k_i,
(flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
&ei->journal_seq);
}
-static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
-
static size_t bch2_xattr_emit(struct dentry *dentry,
const struct bch_xattr *xattr,
char *buffer, size_t buffer_size)
#ifndef _BCACHE_XATTR_H
#define _BCACHE_XATTR_H
+#include "str_hash.h"
+
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
extern const struct bkey_ops bch2_bkey_xattr_ops;
struct dentry;
#include <linux/kernel.h>
#include <linux/export.h>
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter dst_iter,
- struct bio *src, struct bvec_iter src_iter)
+void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
+ struct bio *src, struct bvec_iter *src_iter)
{
struct bio_vec src_bv, dst_bv;
void *src_p, *dst_p;
unsigned bytes;
- while (1) {
- if (!src_iter.bi_size) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_iter = src->bi_iter;
- }
-
- if (!dst_iter.bi_size) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_iter = dst->bi_iter;
- }
-
- src_bv = bio_iter_iovec(src, src_iter);
- dst_bv = bio_iter_iovec(dst, dst_iter);
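+	/* copy until either iterator is exhausted; both advance in place: */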
+ while (src_iter->bi_size && dst_iter->bi_size) {
+ src_bv = bio_iter_iovec(src, *src_iter);
+ dst_bv = bio_iter_iovec(dst, *dst_iter);
bytes = min(src_bv.bv_len, dst_bv.bv_len);
kunmap_atomic(dst_p);
kunmap_atomic(src_p);
- bio_advance_iter(src, &src_iter, bytes);
- bio_advance_iter(dst, &dst_iter, bytes);
+ flush_dcache_page(dst_bv.bv_page);
+
+ bio_advance_iter(src, src_iter, bytes);
+ bio_advance_iter(dst, dst_iter, bytes);
}
}
+/**
+ * bio_copy_data - copy contents of data buffers from one bio to another
+ * @src: source bio
+ * @dst: destination bio
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
void bio_copy_data(struct bio *dst, struct bio *src)
{
- bio_copy_data_iter(dst, dst->bi_iter,
- src, src->bi_iter);
+ struct bvec_iter src_iter = src->bi_iter;
+ struct bvec_iter dst_iter = dst->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
}
void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)