From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 31 May 2021 19:05:33 +0000 (-0400)
Subject: Update bcachefs sources to 3913e0cac3 bcachefs: Journal space calculation fix
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=b61ad35b974038fd1b0396c51a61d84891ae0523;p=bcachefs-tools-debian

Update bcachefs sources to 3913e0cac3 bcachefs: Journal space calculation fix
---

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8da505a..93876ae 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-ac3ab6a511717db1644ded49a6f417304abba048
+3913e0cac34e0993ab6dde67a2dec1ea485a2e28
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index c79338c..7c90ba0 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -49,14 +49,14 @@ DECLARE_EVENT_CLASS(bch_fs,
 	TP_ARGS(c),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 	),

-	TP_printk("%pU", __entry->uuid)
+	TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
 );

 DECLARE_EVENT_CLASS(bio,
@@ -131,7 +131,7 @@ TRACE_EVENT(journal_reclaim_start,
 		 btree_key_cache_dirty, btree_key_cache_total),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 		__field(u64, min_nr )
 		__field(u64, prereserved )
 		__field(u64, prereserved_total )
@@ -142,7 +142,7 @@ TRACE_EVENT(journal_reclaim_start,
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 		__entry->min_nr = min_nr;
 		__entry->prereserved = prereserved;
 		__entry->prereserved_total = prereserved_total;
@@ -152,8 +152,8 @@ TRACE_EVENT(journal_reclaim_start,
 		__entry->btree_key_cache_total = btree_key_cache_total;
 	),

-	TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
-		  __entry->uuid,
+	TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->min_nr,
 		  __entry->prereserved,
 		  __entry->prereserved_total,
@@ -168,16 +168,18 @@ TRACE_EVENT(journal_reclaim_finish,
 	TP_ARGS(c, nr_flushed),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
-		__field(u64, nr_flushed )
+		__field(dev_t, dev )
+		__field(u64, nr_flushed )
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
-		__entry->nr_flushed = nr_flushed;
+		__entry->dev = c->dev;
+		__entry->nr_flushed = nr_flushed;
 	),

-	TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+	TP_printk("%d,%d flushed %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->nr_flushed)
 );

 /* bset.c: */
@@ -194,7 +196,7 @@ DECLARE_EVENT_CLASS(btree_node,
 	TP_ARGS(c, b),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 		__field(u8, level )
 		__field(u8, id )
 		__field(u64, inode )
@@ -202,15 +204,16 @@ DECLARE_EVENT_CLASS(btree_node,
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 		__entry->level = b->c.level;
 		__entry->id = b->c.btree_id;
 		__entry->inode = b->key.k.p.inode;
 		__entry->offset = b->key.k.p.offset;
 	),

-	TP_printk("%pU %u id %u %llu:%llu",
-		  __entry->uuid, __entry->level, __entry->id,
+	TP_printk("%d,%d %u id %u %llu:%llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->level, __entry->id,
 		  __entry->inode, __entry->offset)
 );

@@ -254,32 +257,17 @@ DEFINE_EVENT(btree_node, btree_node_reap,
 	TP_ARGS(c, b)
 );

-DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
-	TP_PROTO(struct bch_fs *c),
-	TP_ARGS(c),
-
-	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
-	),
-
-	TP_printk("%pU", __entry->uuid)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
 	TP_PROTO(struct bch_fs *c),
 	TP_ARGS(c)
 );

-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
 	TP_PROTO(struct bch_fs *c),
 	TP_ARGS(c)
 );

-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize,
 	TP_PROTO(struct bch_fs *c),
 	TP_ARGS(c)
 );

@@ -294,18 +282,19 @@ TRACE_EVENT(btree_reserve_get_fail,
 	TP_ARGS(c, required, cl),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 		__field(size_t, required )
 		__field(struct closure *, cl )
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 		__entry->required = required;
 		__entry->cl = cl;
 	),

-	TP_printk("%pU required %zu by %p", __entry->uuid,
+	TP_printk("%d,%d required %zu by %p",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->required, __entry->cl)
 );

@@ -483,19 +472,20 @@ TRACE_EVENT(move_data,
 	TP_ARGS(c, sectors_moved, keys_moved),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 		__field(u64, sectors_moved )
 		__field(u64, keys_moved )
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 		__entry->sectors_moved = sectors_moved;
 		__entry->keys_moved = keys_moved;
 	),

-	TP_printk("%pU sectors_moved %llu keys_moved %llu",
-		  __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
+	TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->sectors_moved, __entry->keys_moved)
 );

 TRACE_EVENT(copygc,
@@ -507,7 +497,7 @@ TRACE_EVENT(copygc,
 		 buckets_moved, buckets_not_moved),

 	TP_STRUCT__entry(
-		__array(char, uuid, 16 )
+		__field(dev_t, dev )
 		__field(u64, sectors_moved )
 		__field(u64, sectors_not_moved )
 		__field(u64, buckets_moved )
@@ -515,17 +505,39 @@ TRACE_EVENT(copygc,
 	),

 	TP_fast_assign(
-		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+		__entry->dev = c->dev;
 		__entry->sectors_moved = sectors_moved;
 		__entry->sectors_not_moved = sectors_not_moved;
 		__entry->buckets_moved = buckets_moved;
 		__entry->buckets_not_moved = buckets_not_moved;
 	),

-	TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
-		  __entry->uuid,
-		  __entry->sectors_moved, __entry->sectors_not_moved,
-		  __entry->buckets_moved, __entry->buckets_not_moved)
+	TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->sectors_moved, __entry->sectors_not_moved,
+		  __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+	TP_PROTO(struct bch_fs *c,
+		 u64 wait_amount, u64 until),
+	TP_ARGS(c, wait_amount, until),
+
+	TP_STRUCT__entry(
+		__field(dev_t, dev )
+		__field(u64, wait_amount )
+		__field(u64, until )
+	),
+
+	TP_fast_assign(
+		__entry->dev = c->dev;
+		__entry->wait_amount = wait_amount;
+		__entry->until = until;
+	),

+	TP_printk("%d,%d waiting for %llu sectors until %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->wait_amount, __entry->until)
 );

 TRACE_EVENT(trans_get_iter,
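The tracepoint conversion above replaces the 16-byte filesystem UUID in each event with the dev_t that fs.c now caches in bch_fs (c->dev = sb->s_dev, further down in this patch), printed as the same "major,minor" pair the rest of the block layer uses. A minimal userspace sketch of the same decoding, using the glibc major()/minor() helpers in place of the kernel's MAJOR()/MINOR() macros; the device number below is a made-up example:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/sysmacros.h>	/* major(), minor(), makedev() */

	int main(void)
	{
		/* hypothetical device number, e.g. what stat() reports in st_dev: */
		dev_t dev = makedev(254, 3);

		/* same "%d,%d" format the converted tracepoints emit: */
		printf("%d,%d\n", (int) major(dev), (int) minor(dev));
		return 0;
	}

Besides being smaller to record than a UUID, a dev_t keys trace output to the same identifiers blktrace and /proc/partitions report.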
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 24aa2cc..8be95d8 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -263,7 +263,10 @@ do { \
 	BCH_DEBUG_PARAM(verify_btree_ondisk, \
 		"Reread btree nodes at various points to verify the " \
 		"mergesort in the read path against modifications " \
-		"done in memory")
+		"done in memory") \
+	BCH_DEBUG_PARAM(verify_all_btree_replicas, \
+		"When reading btree nodes, read all replicas and " \
+		"compare them")

 /* Parameters that should only be compiled in in debug mode: */
 #define BCH_DEBUG_PARAMS_DEBUG() \
@@ -387,6 +390,14 @@ struct gc_pos {
 	unsigned level;
 };

+struct reflink_gc {
+	u64 offset;
+	u32 size;
+	u32 refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
 struct io_count {
 	u64 sectors[2][BCH_DATA_NR];
 };
@@ -564,6 +575,7 @@ struct bch_fs {
 	int minor;
 	struct device *chardev;
 	struct super_block *vfs_sb;
+	dev_t dev;
 	char name[40];

 	/* ro/rw, add/remove/resize devices: */
@@ -623,6 +635,7 @@ struct bch_fs {

 	/* BTREE CACHE */
 	struct bio_set btree_bio;
+	struct workqueue_struct *io_complete_wq;

 	struct btree_root btree_roots[BTREE_ID_NR];
 	struct mutex btree_root_lock;
@@ -660,7 +673,8 @@ struct bch_fs {

 	struct btree_key_cache btree_key_cache;

-	struct workqueue_struct *wq;
+	struct workqueue_struct *btree_update_wq;
+	struct workqueue_struct *btree_error_wq;
 	/* copygc needs its own workqueue for index updates.. */
 	struct workqueue_struct *copygc_wq;

@@ -799,6 +813,9 @@ struct bch_fs {

 	/* REFLINK */
 	u64 reflink_hint;
+	reflink_gc_table reflink_gc_table;
+	size_t reflink_gc_nr;
+	size_t reflink_gc_idx;

 	/* VFS IO PATH - fs-io.c */
 	struct bio_set writepage_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d640a31..79c0876 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1344,6 +1344,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);

 LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
 LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);

 /*
  * Features:
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26203a5..8a149e2 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -1193,13 +1193,11 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,

 static inline void prefetch_four_cachelines(void *p)
 {
-#if (CONFIG_X86_64 && !defined(__clang__))
-	asm(".intel_syntax noprefix;"
-	    "prefetcht0 [%0 - 127 + 64 * 0];"
-	    "prefetcht0 [%0 - 127 + 64 * 1];"
-	    "prefetcht0 [%0 - 127 + 64 * 2];"
-	    "prefetcht0 [%0 - 127 + 64 * 3];"
-	    ".att_syntax prefix;"
+#if CONFIG_X86_64
+	asm("prefetcht0 (-127 + 64 * 0)(%0);"
+	    "prefetcht0 (-127 + 64 * 1)(%0);"
+	    "prefetcht0 (-127 + 64 * 2)(%0);"
+	    "prefetcht0 (-127 + 64 * 3)(%0);"
 	    :
 	    : "r" (p + 127));
 #else
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index e28292e..b03432c 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -23,6 +23,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "recovery.h"
+#include "reflink.h"
 #include "replicas.h"
 #include "super-io.h"

@@ -1282,6 +1283,197 @@ static int bch2_gc_start(struct bch_fs *c,
 	return 0;
 }

+static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct reflink_gc *r;
+	const __le64 *refcount = bkey_refcount_c(k);
+	char buf[200];
+	int ret = 0;
+
+	if (!refcount)
+		return 0;
+
+	r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
+	if (!r ||
+	    r->offset != k.k->p.offset ||
+	    r->size != k.k->size) {
+		bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+		return -EINVAL;
+	}
+
+	if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+			"reflink key has wrong refcount:\n"
+			" %s\n"
+			" should be %u",
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+			r->refcount)) {
+		struct bkey_i *new;
+
+		new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+		if (!new) {
+			ret = -ENOMEM;
+			goto fsck_err;
+		}
+
+		bkey_reassemble(new, k);
+
+		if (!r->refcount) {
+			new->k.type = KEY_TYPE_deleted;
+			new->k.size = 0;
+		} else {
+			*bkey_refcount(new) = cpu_to_le64(r->refcount);
+		}
+
+		ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
+		if (ret)
+			kfree(new);
+	}
+fsck_err:
+	return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
+				bool metadata_only)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct reflink_gc *r;
+	size_t idx = 0;
+	char buf[200];
+	int ret = 0;
+
+	if (metadata_only)
+		return 0;
+
+	if (initial) {
+		c->reflink_gc_idx = 0;
+
+		ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+				bch2_gc_reflink_done_initial_fn);
+		goto out;
+	}
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		const __le64 *refcount = bkey_refcount_c(k);
+
+		if (!refcount)
+			continue;
+
+		r = genradix_ptr(&c->reflink_gc_table, idx++);
+		if (!r ||
+		    r->offset != k.k->p.offset ||
+		    r->size != k.k->size) {
+			bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+			ret = -EINVAL;
+			break;
+		}
+
+		if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+				"reflink key has wrong refcount:\n"
+				" %s\n"
+				" should be %u",
+				(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+				r->refcount)) {
+			struct bkey_i *new;
+
+			new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+			if (!new) {
+				ret = -ENOMEM;
+				break;
+			}
+
+			bkey_reassemble(new, k);
+
+			if (!r->refcount)
+				new->k.type = KEY_TYPE_deleted;
+			else
+				*bkey_refcount(new) = cpu_to_le64(r->refcount);
+
+			ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+					__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
+			kfree(new);
+
+			if (ret)
+				break;
+		}
+	}
+fsck_err:
+	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_exit(&trans);
+out:
+	genradix_free(&c->reflink_gc_table);
+	c->reflink_gc_nr = 0;
+	return ret;
+}
+
+static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct reflink_gc *r;
+	const __le64 *refcount = bkey_refcount_c(k);
+
+	if (!refcount)
+		return 0;
+
+	r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+			       GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	r->offset = k.k->p.offset;
+	r->size = k.k->size;
+	r->refcount = 0;
+	return 0;
+}
+
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+				 bool metadata_only)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct reflink_gc *r;
+	int ret;
+
+	if (metadata_only)
+		return 0;
+
+	genradix_free(&c->reflink_gc_table);
+	c->reflink_gc_nr = 0;
+
+	if (initial)
+		return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+				bch2_gc_reflink_start_initial_fn);
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		const __le64 *refcount = bkey_refcount_c(k);
+
+		if (!refcount)
+			continue;
+
+		r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+				       GFP_KERNEL);
+		if (!r) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		r->offset = k.k->p.offset;
+		r->size = k.k->size;
+		r->refcount = 0;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	bch2_trans_exit(&trans);
+	return ret;
+}
+
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute 
them: * @@ -1316,7 +1512,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); again: - ret = bch2_gc_start(c, metadata_only); + ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; @@ -1378,7 +1575,8 @@ out: bch2_journal_block(&c->journal); percpu_down_write(&c->mark_lock); - ret = bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); } else { diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 094285b..47cfd8a 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -521,7 +521,7 @@ enum btree_validate_ret { \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf2); \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -815,6 +815,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; + unsigned nonblacklisted_written = 0; int ret, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; @@ -934,15 +935,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sort_iter_add(iter, vstruct_idx(i, whiteout_u64s), vstruct_last(i)); + + nonblacklisted_written = b->written; } for (bne = write_block(b); bset_byte_offset(b, bne) < btree_bytes(c); bne = (void *) bne + block_bytes(c)) - btree_err_on(bne->keys.seq == b->data->keys.seq, + btree_err_on(bne->keys.seq == b->data->keys.seq && + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), BTREE_ERR_WANT_RETRY, c, ca, b, NULL, "found bset signature after last bset"); + /* + * Blacklisted bsets are those that were written after the most recent + * (flush) journal write. 
Since there wasn't a flush, they may not have
+	 * made it to all devices - which means we shouldn't write new bsets
+	 * after them, as that could leave a gap and then reads from that device
+	 * wouldn't find all the bsets in that btree node - which means it's
+	 * important that we start writing new bsets after the most recent _non_
+	 * blacklisted bset:
+	 */
+	b->written = nonblacklisted_written;
+
 	sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
 	sorted->keys.u64s = 0;

@@ -1027,8 +1044,8 @@ static void btree_node_read_work(struct work_struct *work)
 	struct btree_read_bio *rb =
 		container_of(work, struct btree_read_bio, work);
 	struct bch_fs *c = rb->c;
+	struct btree *b = rb->b;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
-	struct btree *b = rb->bio.bi_private;
 	struct bio *bio = &rb->bio;
 	struct bch_io_failures failed = { .nr = 0 };
 	char buf[200];
@@ -1101,7 +1118,262 @@ static void btree_node_read_endio(struct bio *bio)
 		bch2_latency_acct(ca, rb->start_time, READ);
 	}

-	queue_work(system_unbound_wq, &rb->work);
+	queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+	struct closure cl;
+	struct bch_fs *c;
+	struct btree *b;
+	unsigned nr;
+	void *buf[BCH_REPLICAS_MAX];
+	struct bio *bio[BCH_REPLICAS_MAX];
+	int err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+	struct btree_node *bn = data;
+	struct btree_node_entry *bne;
+	unsigned offset = 0;
+
+	if (le64_to_cpu(bn->magic) != bset_magic(c))
+		return 0;
+
+	while (offset < c->opts.btree_node_size) {
+		if (!offset) {
+			offset += vstruct_sectors(bn, c->block_bits);
+		} else {
+			bne = data + (offset << 9);
+			if (bne->keys.seq != bn->keys.seq)
+				break;
+			offset += vstruct_sectors(bne, c->block_bits);
+		}
+	}
+
+	return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+	struct btree_node *bn = data;
+	struct btree_node_entry *bne;
+
+	if (!offset)
+		return false;
+
+	while (offset < c->opts.btree_node_size) {
+		bne = data + (offset << 9);
+		if (bne->keys.seq == bn->keys.seq)
+			return true;
+		offset++;
+	}
+
+	return false;
+}
+
+static void btree_node_read_all_replicas_done(struct closure *cl)
+{
+	struct btree_node_read_all *ra =
+		container_of(cl, struct btree_node_read_all, cl);
+	struct bch_fs *c = ra->c;
+	struct btree *b = ra->b;
+	bool have_good_copy = false;
+	bool dump_bset_maps = false;
+	bool have_retry = false;
+	int ret = 0, write = READ;
+	unsigned i, written, written2;
+	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+		? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + + for (i = 0; i < ra->nr; i++) { + if (ra->err[i]) + continue; + + if (!have_good_copy) { + memcpy(b->data, ra->buf[i], btree_bytes(c)); + have_good_copy = true; + written = btree_node_sectors_written(c, b->data); + } + + /* Try to get the right btree node: */ + if (have_good_copy && + seq && + b->data->keys.seq != seq && + ((struct btree_node *) ra->buf[i])->keys.seq == seq) { + memcpy(b->data, ra->buf[i], btree_bytes(c)); + written = btree_node_sectors_written(c, b->data); + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); + if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(b->data, ra->buf[i], written << 9), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; + memcpy(b->data, ra->buf[i], btree_bytes(c)); + } + } +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { + char buf[200]; + struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; + bool gap = false; + + if (ra->err[i]) + continue; + + while (offset < c->opts.btree_node_size) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + sectors = vstruct_sectors(bne, c->block_bits); + } + + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + offset += sectors; + } + + while (offset < c->opts.btree_node_size) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) + pr_buf(&out, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + } + offset++; + } + + bch_err(c, "replica %u:%s", i, buf); + } + } + + if (have_good_copy) + bch2_btree_node_read_done(c, NULL, b, false); + else + set_btree_node_read_error(b); + + for (i = 0; i < ra->nr; i++) { + mempool_free(ra->buf[i], &c->btree_bounce_pool); + bio_put(ra->bio[i]); + } + + closure_debug_destroy(&ra->cl); + kfree(ra); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_all_replicas_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + struct btree_node_read_all *ra = rb->ra; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } + + ra->err[rb->idx] = bio->bi_status; + closure_put(&ra->cl); +} + +/* + * XXX This allocates multiple times from the same mempools, and can deadlock + * under sufficient memory pressure (but is only a debug path) + */ +static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) +{ + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded 
pick; + struct btree_node_read_all *ra; + unsigned i; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + closure_init(&ra->cl, NULL); + ra->c = c; + ra->b = b; + ra->nr = bch2_bkey_nr_ptrs(k); + + for (i = 0; i < ra->nr; i++) { + ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i], + btree_bytes(c)), + &c->btree_bio); + } + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct btree_read_bio *rb = + container_of(ra->bio[i], struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = ra; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->idx = i; + rb->pick = pick; + rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + rb->bio.bi_iter.bi_sector = pick.ptr.offset; + rb->bio.bi_end_io = btree_node_read_all_replicas_endio; + bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(&rb->bio)); + bio_set_dev(&rb->bio, ca->disk_sb.bdev); + + closure_get(&ra->cl); + submit_bio(&rb->bio); + } else { + ra->err[i] = BLK_STS_REMOVED; + } + + i++; + } + + if (sync) { + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl); + } else { + continue_at(&ra->cl, btree_node_read_all_replicas_done, + c->io_complete_wq); + } + + return 0; } void bch2_btree_node_read(struct bch_fs *c, struct btree *b, @@ -1117,6 +1390,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); + set_btree_node_read_in_flight(b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick); if (bch2_fs_fatal_err_on(ret <= 0, c, @@ -1133,6 +1412,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; + rb->b = b; + rb->ra = NULL; rb->start_time = local_clock(); rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; @@ -1140,11 +1421,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_end_io = btree_node_read_endio; - bio->bi_private = b; bch2_bio_map(bio, b->data, btree_bytes(c)); - set_btree_node_read_in_flight(b); - if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -1153,7 +1431,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - bio->bi_private = b; btree_node_read_work(&rb->work); } else { submit_bio(bio); @@ -1164,8 +1441,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) btree_node_read_work(&rb->work); else - queue_work(system_unbound_wq, &rb->work); - + queue_work(c->io_complete_wq, &rb->work); } } @@ -1332,7 +1608,7 @@ static void btree_node_write_work(struct work_struct *work) bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); - queue_work(c->wq, &c->btree_write_error_work); + queue_work(c->btree_error_wq, &c->btree_write_error_work); return; } @@ -1371,7 +1647,7 @@ static void btree_node_write_endio(struct bio *bio) container_of(orig, struct btree_write_bio, wbio); INIT_WORK(&wb->work, btree_node_write_work); - 
queue_work(system_unbound_wq, &wb->work); + queue_work(c->io_complete_wq, &wb->work); } } @@ -1441,6 +1717,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) return; if (old & (1 << BTREE_NODE_write_in_flight)) { + /* + * XXX waiting on btree writes with btree locks held - + * this can deadlock, and we hit the write error path + */ btree_node_wait_on_io(b); continue; } @@ -1631,7 +1911,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) atomic64_add(sectors_to_write, &c->btree_writes_sectors); INIT_WORK(&wbio->work, btree_write_submit); - schedule_work(&wbio->work); + queue_work(c->io_complete_wq, &wbio->work); return; err: set_btree_node_noevict(b); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index cadcf7f..abbc467 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -13,6 +13,7 @@ struct bch_fs; struct btree_write; struct btree; struct btree_iter; +struct btree_node_read_all; static inline bool btree_node_dirty(struct btree *b) { @@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) struct btree_read_bio { struct bch_fs *c; + struct btree *b; + struct btree_node_read_all *ra; u64 start_time; unsigned have_ioref:1; + unsigned idx:7; struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 7f86a39..bdb068e 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2260,6 +2260,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned expected_nr_iters, size_t expected_mem_bytes) + __acquires(&c->btree_trans_barrier) { memset(trans, 0, sizeof(*trans)); trans->c = c; @@ -2292,6 +2293,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, } int bch2_trans_exit(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) { struct bch_fs *c = trans->c; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index bee7ee6..b0484c7 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as) BUG_ON(!journal_pin_active(&as->journal)); + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: + */ + for (i = 0; i < as->nr_old_nodes; i++) { + struct btree_node *bn = READ_ONCE(as->old_nodes[i]->data); + + /* + * This is technically a use after free, but it's just a read - + * but it might cause problems in userspace where freeing the + * buffer may unmap it: + */ + if (bn && bn->keys.seq == as->old_nodes_seq[i]) + btree_node_wait_on_io(as->old_nodes[i]); + } + /* * We did an update to a parent node where the pointers we added pointed * to child nodes that weren't written yet: now, the child nodes have @@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, btree_update_will_delete_key(as, &b->key); - /* - * XXX: Waiting on io with btree node locks held, we don't want to be - * doing this. We can't have btree writes happening after the space has - * been freed, but we really only need to block before - * btree_update_nodes_written_trans() happens. 
- */ - btree_node_wait_on_io(b); + as->old_nodes[as->nr_old_nodes] = b; + as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; + as->nr_old_nodes++; } void bch2_btree_update_done(struct btree_update *as) @@ -908,7 +920,8 @@ void bch2_btree_update_done(struct btree_update *as) bch2_btree_reserve_put(as); - continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); } struct btree_update * @@ -1826,7 +1839,10 @@ void async_btree_node_rewrite_work(struct work_struct *work) void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { - struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); + struct async_btree_rewrite *a; + + if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + return; if (!percpu_ref_tryget(&c->writes)) return; @@ -1844,7 +1860,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - queue_work(system_long_wq, &a->work); + queue_work(c->btree_interior_update_worker, &a->work); } static void __bch2_btree_node_update_key(struct bch_fs *c, diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 7eef3db..7ed67b4 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -92,6 +92,10 @@ struct btree_update { struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; unsigned nr_new_nodes; + struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; + __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; + unsigned nr_old_nodes; + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * BCH_REPLICAS_MAX]; open_bucket_idx_t nr_open_buckets; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index cbd295e..d07085a 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -14,6 +14,7 @@ #include "ec.h" #include "error.h" #include "movinggc.h" +#include "reflink.h" #include "replicas.h" #include @@ -1072,6 +1073,124 @@ static int bch2_mark_stripe(struct bch_fs *c, return 0; } +static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p, + u64 p_start, u64 p_end, + u64 v_start, u64 v_end) +{ + if (p_start == p_end) + return false; + + p_start += le64_to_cpu(p.v->idx); + p_end += le64_to_cpu(p.v->idx); + + if (p_end <= v_start) + return false; + if (p_start >= v_end) + return false; + return true; +} + +static int reflink_p_frag_references(struct bkey_s_c_reflink_p p, + u64 start, u64 end, + struct bkey_s_c k) +{ + return __reflink_p_frag_references(p, start, end, + bkey_start_offset(k.k), + k.k->p.offset); +} + +static int __bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned front_frag, + unsigned back_frag, + unsigned flags, + size_t *r_idx) +{ + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + int frags_referenced; + + while (1) { + if (*r_idx >= c->reflink_gc_nr) + goto not_found; + r = genradix_ptr(&c->reflink_gc_table, *r_idx); + BUG_ON(!r); + + if (r->offset > idx) + break; + (*r_idx)++; + } + + frags_referenced = + __reflink_p_frag_references(p, 0, front_frag, + r->offset - r->size, r->offset) + + __reflink_p_frag_references(p, back_frag, p.k->size, + r->offset - r->size, r->offset); + + if (frags_referenced == 2) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); + add = -add; + } else if (frags_referenced == 1) { + BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); + add = 0; + } + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + return min_t(u64, sectors, r->offset - idx); +not_found: + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + bch2_inconsistent_error(c); + return -EIO; +} + +static int bch2_mark_reflink_p(struct bch_fs *c, + struct bkey_s_c_reflink_p p, unsigned offset, + s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + struct reflink_gc *ref; + size_t l, r, m; + unsigned front_frag, back_frag; + s64 ret = 0; + + if (sectors < 0) + sectors = -sectors; + + BUG_ON(offset + sectors > p.k->size); + + front_frag = offset; + back_frag = offset + sectors; + + l = 0; + r = c->reflink_gc_nr; + while (l < r) { + m = l + (r - l) / 2; + + ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (sectors) { + ret = __bch2_mark_reflink_p(c, p, idx, sectors, + front_frag, back_frag, flags, &l); + if (ret < 0) + return ret; + + idx += ret; + sectors -= ret; + } + + return 0; +} + static int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c old, struct bkey_s_c new, @@ -1127,6 +1246,10 @@ static int bch2_mark_key_locked(struct bch_fs *c, fs_usage->persistent_reserved[replicas - 1] += sectors; break; } + case KEY_TYPE_reflink_p: + ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k), + offset, sectors, flags); + break; } preempt_enable(); @@ -1689,35 +1812,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -static __le64 *bkey_refcount(struct bkey_i *k) -{ - switch (k->k.type) { - case KEY_TYPE_reflink_v: - return &bkey_i_to_reflink_v(k)->v.refcount; - case KEY_TYPE_indirect_inline_data: - return &bkey_i_to_indirect_inline_data(k)->v.refcount; - default: - return NULL; - } -} - -static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p, - u64 start, u64 end, - struct bkey_s_c k) -{ - if (start == end) - return false; - - start += le64_to_cpu(p.v->idx); - end += le64_to_cpu(p.v->idx); - - if (end <= bkey_start_offset(k.k)) - return false; - if (start >= k.k->p.offset) - return false; - return true; -} - static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 idx, unsigned sectors, @@ -1731,6 +1825,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + int frags_referenced; s64 ret; ret = trans_get_key(trans, BTREE_ID_reflink, @@ -1738,18 +1833,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, if (ret < 0) return ret; - if (reflink_p_frag_references(p, 0, front_frag, k) && - reflink_p_frag_references(p, back_frag, p.k->size, k)) { + sectors = min_t(u64, sectors, k.k->p.offset - idx); + + frags_referenced = + reflink_p_frag_references(p, 0, front_frag, k) + + reflink_p_frag_references(p, back_frag, p.k->size, k); + + if (frags_referenced == 2) { BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT)); add = -add; - } else if (reflink_p_frag_references(p, 0, front_frag, k) || - reflink_p_frag_references(p, back_frag, p.k->size, k)) { + } else if (frags_referenced == 1) { BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE)); goto out; } - sectors = min_t(u64, sectors, k.k->p.offset - idx); - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -1804,14 +1901,13 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, front_frag, back_frag, flags); if (ret < 0) - break; + return ret; - idx += ret; - sectors = max_t(s64, 0LL, sectors - ret); - ret = 0; + idx += ret; + sectors -= ret; } - return ret; + return 0; } int bch2_trans_mark_key(struct btree_trans *trans, diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 08c6af8..00a63fe 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -23,6 +23,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, struct btree_iter *inode_iter = NULL; struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); u64 dir_offset = 0; int ret; @@ -36,7 +37,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - inode_iter = bch2_inode_create(trans, new_inode, U32_MAX); + inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); ret = PTR_ERR_OR_ZERO(inode_iter); if (ret) goto err; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index eb87163..d8cc32e 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -13,6 +13,9 @@ #include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) +#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ struct flags_set { unsigned mask; @@ -247,11 +250,54 @@ err1: return ret; } +static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) +{ + u32 flags; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, arg)) + return -EFAULT; + + bch_notice(c, "shutdown by ioctl type %u", flags); + + down_write(&c->vfs_sb->s_umount); + + switch (flags) { + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); + if (ret) + goto err; + + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); + break; + + case FSOP_GOING_FLAGS_LOGFLUSH: + bch2_journal_flush(&c->journal); + fallthrough; + + case FSOP_GOING_FLAGS_NOLOGFLUSH: + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + break; + default: + ret = -EINVAL; + break; + } +err: + up_write(&c->vfs_sb->s_umount); + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = 
file_bch_inode(file); - struct super_block *sb = inode->v.i_sb; - struct bch_fs *c = sb->s_fs_info; + struct bch_fs *c = inode->v.i_sb->s_fs_info; switch (cmd) { case FS_IOC_GETFLAGS: @@ -276,15 +322,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) return -ENOTTY; case FS_IOC_GOINGDOWN: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - down_write(&sb->s_umount); - sb->s_flags |= SB_RDONLY; - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only due to ioctl"); - up_write(&sb->s_umount); - return 0; + return bch2_ioc_goingdown(c, (u32 __user *) arg); default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 25a9fc1..e8a329c 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1578,6 +1578,8 @@ got_sb: break; } + c->dev = sb->s_dev; + #ifdef CONFIG_BCACHEFS_POSIX_ACL if (c->opts.acl) sb->s_flags |= SB_POSIXACL; diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index c5892e4..6b43a97 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -472,23 +472,28 @@ static inline u32 bkey_generation(struct bkey_s_c k) struct btree_iter *bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, - u32 snapshot) + u32 snapshot, u64 cpu) { struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, pos, *hint; int ret = 0; + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - u64 cpu = raw_smp_processor_id(); - unsigned bits = (c->opts.inodes_32bit - ? 31 : 63) - c->inode_shard_bits; + if (c->opts.shard_inode_numbers) { + bits -= c->inode_shard_bits; - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + } else { + min = BLOCKDEV_INODE_MAX; + max = ~(ULLONG_MAX << bits); + hint = c->unused_inode_hints; + } start = READ_ONCE(*hint); diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 558d546..2cb081a 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, struct bch_inode_unpacked *); struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, u32); + struct bch_inode_unpacked *, u32, u64); int bch2_inode_rm(struct bch_fs *, u64, bool); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 9b6aece..157b2a0 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1439,7 +1439,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_migrate_read_done(&op->write, rbio); closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); closure_return_with_destructor(cl, promote_done); } @@ -1822,6 +1822,13 @@ static void __bch2_read_endio(struct work_struct *work) if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) goto csum_err; + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
+ */ if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 144dc93..bc0a0bd 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -58,7 +58,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC ? op->c->copygc_wq - : op->c->wq; + : op->c->btree_update_wq; } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 52efa46..af5386d 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -118,7 +118,9 @@ void bch2_journal_halt(struct journal *j) void __bch2_journal_buf_put(struct journal *j) { - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } /* @@ -304,7 +306,7 @@ static int journal_entry_open(struct journal *j) j->res_get_blocked_start); j->res_get_blocked_start = 0; - mod_delayed_work(system_freezable_wq, + mod_delayed_work(c->io_complete_wq, &j->write_work, msecs_to_jiffies(j->write_delay_ms)); journal_wake(j); @@ -805,10 +807,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long b; if (new_fs) { - percpu_down_read(&c->mark_lock); b = bch2_bucket_alloc_new_fs(ca); if (b < 0) { - percpu_up_read(&c->mark_lock); ret = -ENOSPC; goto err; } @@ -825,7 +825,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, b = sector_to_bucket(ca, ob->ptr.offset); } - spin_lock(&c->journal.lock); + if (c) + spin_lock(&c->journal.lock); /* * XXX @@ -852,14 +853,14 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (pos <= ja->cur_idx) ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - spin_unlock(&c->journal.lock); + if (c) + spin_unlock(&c->journal.lock); if (new_fs) { bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), 0); - percpu_up_read(&c->mark_lock); } else { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 635cceb..2da6839 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -834,7 +834,7 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, unsigned i; for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); u64 offset; div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset); @@ -1233,8 +1233,6 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_devs_list devs = - bch2_bkey_devs(bkey_i_to_s_c(&w->key)); struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; @@ -1242,11 +1240,12 @@ static void journal_write_done(struct closure *cl) bch2_time_stats_update(j->write_time, j->write_start_time); - if (!devs.nr) { + if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); if (bch2_mark_replicas(c, &replicas.e)) err = -EIO; } @@ -1258,7 +1257,7 
@@ static void journal_write_done(struct closure *cl) seq = le64_to_cpu(w->data->seq); if (seq >= j->pin.front) - journal_seq_pin(j, seq)->devs = devs; + journal_seq_pin(j, seq)->devs = w->devs_written; j->seq_ondisk = seq; if (err && (!j->err_seq || seq < j->err_seq)) @@ -1296,27 +1295,27 @@ static void journal_write_done(struct closure *cl) journal_wake(j); if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(system_freezable_wq, &j->write_work, 0); + mod_delayed_work(c->io_complete_wq, &j->write_work, 0); spin_unlock(&j->lock); if (new.unwritten_idx != new.idx && !journal_state_count(new, new.unwritten_idx)) - closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); } static void journal_write_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; + struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s", + if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", + le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; - spin_lock_irqsave(&j->err_lock, flags); - bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); + bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } @@ -1370,7 +1369,7 @@ static void do_journal_write(struct closure *cl) le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; } @@ -1402,7 +1401,8 @@ void bch2_journal_write(struct closure *cl) test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); - jset->last_seq = w->last_seq = 0; + jset->last_seq = 0; + w->last_seq = 0; j->nr_noflush_writes++; } else { @@ -1509,14 +1509,12 @@ retry_alloc: journal_debug_buf); kfree(journal_debug_buf); bch2_fatal_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; } - /* - * XXX: we really should just disable the entire journal in nochanges - * mode - */ + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + if (c->opts.nochanges) goto no_io; @@ -1542,14 +1540,14 @@ retry_alloc: bch2_bucket_seq_cleanup(c); - continue_at(cl, do_journal_write, system_highpri_wq); + continue_at(cl, do_journal_write, c->io_complete_wq); return; no_io: bch2_bucket_seq_cleanup(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); return; err: bch2_inconsistent_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); + continue_at(cl, journal_write_done, c->io_complete_wq); } diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 427be2d..7a0ae5d 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -93,6 +93,10 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, * until we write it out - thus, account for it here: */ while ((unwritten = get_unwritten_sectors(j, &idx))) { + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; + if (unwritten >= sectors) { if (!buckets) { sectors = 0; diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c 
index e1b63f3..f2060f9 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -111,8 +111,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl->start[nr].start = cpu_to_le64(start); bl->start[nr].end = cpu_to_le64(end); out_write_sb: - c->disk_sb.sb->features[0] |= - 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); out: @@ -298,8 +297,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) BUG_ON(new_nr && !bl); if (!new_nr) - c->disk_sb.sb->features[0] &= - ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); bch2_write_super(c); } diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index cacab22..61674ae 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -21,6 +21,7 @@ struct journal_buf { struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); + struct bch_devs_list devs_written; struct closure_waitlist wait; u64 last_seq; /* copy of data->last_seq */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 778ff72..2fa763e 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -523,6 +523,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto err; + if (!k.k || bkey_cmp(k.k->p, pos)) { + ret = -ENOENT; + goto err; + } + ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; if (ret) goto err; @@ -921,8 +926,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 61c5901..2acca0d 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -317,6 +317,8 @@ static int bch2_copygc_thread(void *arg) set_freezable(); while (!kthread_should_stop()) { + cond_resched(); + if (kthread_wait_freezable(c->copy_gc_enabled)) break; @@ -324,6 +326,7 @@ static int bch2_copygc_thread(void *arg) wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { + trace_copygc_wait(c, wait, last + wait); c->copygc_wait = last + wait; bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 001e865..1e2fc5d 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -165,8 +165,13 @@ enum opt_type { x(inodes_32bit, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_INODE_32BIT, false, \ + BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ + OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, false, \ + NULL, "Shard new inode numbers by CPU id") \ x(gc_reserve_percent, u8, \ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index cd538ec..9bd6348 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -716,7 +716,7 @@ static 
int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_dev_usage: { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); - struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / sizeof(struct jset_entry_dev_usage_type); @@ -755,7 +755,7 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - atomic64_set(&c->io_clock[clock->rw].now, clock->time); + atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); } } @@ -1217,13 +1217,13 @@ use_clean: mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { - c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); write_sb = true; } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } @@ -1278,12 +1278,12 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); if (c->opts.version_upgrade) { - c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index c624fab..a420729 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); - refcount = (void *) &r_v->v; + refcount = bkey_refcount(r_v); *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); @@ -181,18 +181,19 @@ err: static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) { - struct bkey_s_c k = bch2_btree_iter_peek(iter); + struct bkey_s_c k; int ret; for_each_btree_key_continue(iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) - return bkey_s_c_null; + break; if (bkey_extent_is_data(k.k)) - break; + return k; } - return k; + bch2_btree_iter_set_pos(iter, end); + return bkey_s_c_null; } s64 bch2_remap_range(struct bch_fs *c, @@ -205,8 +206,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; struct bpos dst_end = dst_start, src_end = src_start; - struct bpos dst_want, src_want; - u64 src_done, dst_done; + struct bpos src_want; + u64 dst_done; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -226,7 +227,8 @@ s64 bch2_remap_range(struct bch_fs *c, dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, BTREE_ITER_INTENT); - while (ret == 0 || ret == 
-EINTR) { + while ((ret == 0 || ret == -EINTR) && + bkey_cmp(dst_iter->pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -236,32 +238,29 @@ s64 bch2_remap_range(struct bch_fs *c, break; } + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); + src_k = get_next_src(src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; - src_done = bpos_min(src_iter->pos, src_end).offset - - src_start.offset; - dst_want = POS(dst_start.inode, dst_start.offset + src_done); - - if (bkey_cmp(dst_iter->pos, dst_want) < 0) { - ret = bch2_fpunch_at(&trans, dst_iter, dst_want, - journal_seq, i_sectors_delta); + if (bkey_cmp(src_want, src_iter->pos) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, + bpos_min(dst_end, + POS(dst_iter->pos.inode, dst_iter->pos.offset + + src_iter->pos.offset - src_want.offset)), + journal_seq, i_sectors_delta); continue; } - BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); - - if (!bkey_cmp(dst_iter->pos, dst_end)) - break; - if (src_k.k->type != KEY_TYPE_reflink_p) { bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - bch2_cut_front(src_iter->pos, new_src.k); - bch2_cut_back(src_end, new_src.k); + bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k)); ret = bch2_make_extent_indirect(&trans, src_iter, new_src.k); @@ -278,7 +277,7 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_reflink_p_init(new_dst.k); u64 offset = le64_to_cpu(src_p.v->idx) + - (src_iter->pos.offset - + (src_want.offset - bkey_start_offset(src_k.k)); dst_p->v.idx = cpu_to_le64(offset); @@ -288,20 +287,13 @@ s64 bch2_remap_range(struct bch_fs *c, new_dst.k->k.p = dst_iter->pos; bch2_key_resize(&new_dst.k->k, - min(src_k.k->p.offset - src_iter->pos.offset, + min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, new_dst.k, &disk_res, journal_seq, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); - if (ret) - continue; - - dst_done = dst_iter->pos.offset - dst_start.offset; - src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(src_iter, src_want); } bch2_trans_iter_put(&trans, dst_iter); bch2_trans_iter_put(&trans, src_iter); diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 9d5e7dc..bfc7856 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -34,6 +34,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *, .val_to_text = bch2_indirect_inline_data_to_text, \ } +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_reflink_v: + return &bkey_s_c_to_reflink_v(k).v->refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_s_c_to_indirect_inline_data(k).v->refcount; + default: + return NULL; + } +} + +static inline __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, u64, u64 *, u64, s64 *); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 74a75ce..9778851 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -982,7 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); 
SET_BCH_SB_CLEAN(c->disk_sb.sb, false); - c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS; + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -999,7 +999,7 @@ static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) * The u64s field counts from the start of data, ignoring the shared * fields. */ - entry->u64s = u64s - 1; + entry->u64s = cpu_to_le16(u64s - 1); *end = vstruct_next(*end); return entry; @@ -1092,7 +1092,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, clock->entry.type = BCH_JSET_ENTRY_clock; clock->rw = i; - clock->time = atomic64_read(&c->io_clock[i].now); + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); } } @@ -1109,10 +1109,10 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; - c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); - c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 3b1e920..4c67936 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -509,10 +509,14 @@ static void __bch2_fs_free(struct bch_fs *c) kfree(c->unused_inode_hints); free_heap(&c->copygc_heap); + if (c->io_complete_wq ) + destroy_workqueue(c->io_complete_wq ); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->wq) - destroy_workqueue(c->wq); + if (c->btree_error_wq) + destroy_workqueue(c->btree_error_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); kvpfree(c, sizeof(*c)); @@ -760,10 +764,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->wq = alloc_workqueue("bcachefs", + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_error_wq = alloc_workqueue("bcachefs_error", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || @@ -1437,7 +1445,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ -int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) { struct btree_trans trans; size_t i; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 21ef771..84a7acb 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -312,7 +312,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c return 0; } -void bch2_gc_gens_pos_to_text(struct 
printbuf *out, struct bch_fs *c) +static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); bch2_bpos_to_text(out, c->gc_gens_pos);
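
One closing sketch for the reflink changes above: bch2_mark_reflink_p (buckets.c) locates the reflink_gc entry covering a given index with a binary search over a table whose entries are keyed by end offset, then walks forward entry by entry. A self-contained model of that lower-bound search, with a plain array standing in for the genradix table and invented sample extents:

	#include <stdio.h>

	struct reflink_gc {
		unsigned long long offset;	/* end offset of the extent */
		unsigned size, refcount;
	};

	/*
	 * Find the first entry whose end offset is > idx, i.e. the entry that
	 * can contain idx - the same loop shape bch2_mark_reflink_p uses over
	 * c->reflink_gc_table:
	 */
	static size_t reflink_gc_lower_bound(const struct reflink_gc *tbl,
					     size_t nr, unsigned long long idx)
	{
		size_t l = 0, r = nr, m;

		while (l < r) {
			m = l + (r - l) / 2;
			if (tbl[m].offset <= idx)	/* entry ends at or before idx */
				l = m + 1;
			else
				r = m;
		}
		return l;
	}

	int main(void)
	{
		/* entries as bch2_gc_reflink_start would build them, sorted by end offset: */
		const struct reflink_gc tbl[] = {
			{ .offset = 8,  .size = 8  },	/* covers [0, 8)   */
			{ .offset = 24, .size = 16 },	/* covers [8, 24)  */
			{ .offset = 32, .size = 8  },	/* covers [24, 32) */
		};

		/* prints "idx 10 -> entry 1", the extent covering [8, 24): */
		printf("idx 10 -> entry %zu\n", reflink_gc_lower_bound(tbl, 3, 10));
		return 0;
	}

The walk that follows the search then adjusts each overlapping entry's refcount, clamping the step by min(sectors, r->offset - idx) exactly as __bch2_mark_reflink_p does above.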