From: Kent Overstreet Date: Mon, 2 May 2022 22:39:16 +0000 (-0400) Subject: Update bcachefs sources to bdf6d7c135 fixup! bcachefs: Kill journal buf bloom filter X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=6f5afc0c12bbf56ffdabe5b2c5297aef255c4baa;p=bcachefs-tools-debian Update bcachefs sources to bdf6d7c135 fixup! bcachefs: Kill journal buf bloom filter --- diff --git a/.bcachefs_revision b/.bcachefs_revision index 20dd0bb..4fb144e 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -4c2d3669b15475674b750244bb1e096849352bc8 +bdf6d7c1350497bc7b0be6027a51d9330645672d diff --git a/cmd_debug.c b/cmd_debug.c index e29ceff..c2206da 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -108,7 +108,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data, max_t(unsigned, btree_bytes(c) / 8, block_bytes(c))); - darray_exit(data); + darray_exit(&data); } int cmd_dump(int argc, char *argv[]) diff --git a/cmd_format.c b/cmd_format.c index c3debe0..5c2bc8c 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -197,9 +197,9 @@ int cmd_format(int argc, char *argv[]) initialize = false; break; case O_no_opt: - darray_push(device_paths, optarg); + darray_push(&device_paths, optarg); dev_opts.path = optarg; - darray_push(devices, dev_opts); + darray_push(&devices, dev_opts); dev_opts.size = 0; break; case O_quiet: @@ -253,7 +253,7 @@ int cmd_format(int argc, char *argv[]) free(opts.passphrase); } - darray_exit(devices); + darray_exit(&devices); if (initialize) { struct bch_opts mount_opts = bch2_opts_empty(); @@ -275,7 +275,7 @@ int cmd_format(int argc, char *argv[]) bch2_fs_stop(c); } - darray_exit(device_paths); + darray_exit(&device_paths); return 0; } diff --git a/cmd_fs.c b/cmd_fs.c index 4e955ea..ee3ea65 100644 --- a/cmd_fs.c +++ b/cmd_fs.c @@ -267,7 +267,7 @@ static void fs_usage_to_text(struct printbuf *out, const char *path) free(dev->dev); free(dev->label); } - darray_exit(dev_names); + darray_exit(&dev_names); bcache_fs_close(fs); } diff --git a/cmd_migrate.c b/cmd_migrate.c index 08ec7de..b67fc02 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -603,7 +603,7 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, update_inode(c, &root_inode); - darray_exit(s.extents); + darray_exit(&s.extents); genradix_free(&s.hardlinks); } diff --git a/doc/bcachefs-principles-of-operation.tex b/doc/bcachefs-principles-of-operation.tex index d5ac6ed..89b39bf 100644 --- a/doc/bcachefs-principles-of-operation.tex +++ b/doc/bcachefs-principles-of-operation.tex @@ -530,6 +530,8 @@ passed as mount parameters the persistent options are unmodified. \subsection{File and directory options} + + Options set on inodes (files and directories) are automatically inherited by their descendants, and inodes also record whether a given option was explicitly set or inherited from their parent. When renaming a directory would cause diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 1ae5e88..b96b257 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -142,17 +142,21 @@ DEFINE_EVENT(bio, journal_write, ); TRACE_EVENT(journal_reclaim_start, - TP_PROTO(struct bch_fs *c, u64 min_nr, + TP_PROTO(struct bch_fs *c, bool direct, bool kicked, + u64 min_nr, u64 min_key_cache, u64 prereserved, u64 prereserved_total, u64 btree_cache_dirty, u64 btree_cache_total, u64 btree_key_cache_dirty, u64 btree_key_cache_total), - TP_ARGS(c, min_nr, prereserved, prereserved_total, + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, btree_cache_dirty, btree_cache_total, btree_key_cache_dirty, btree_key_cache_total), TP_STRUCT__entry( __field(dev_t, dev ) + __field(bool, direct ) + __field(bool, kicked ) __field(u64, min_nr ) + __field(u64, min_key_cache ) __field(u64, prereserved ) __field(u64, prereserved_total ) __field(u64, btree_cache_dirty ) @@ -163,7 +167,10 @@ TRACE_EVENT(journal_reclaim_start, TP_fast_assign( __entry->dev = c->dev; + __entry->direct = direct; + __entry->kicked = kicked; __entry->min_nr = min_nr; + __entry->min_key_cache = min_key_cache; __entry->prereserved = prereserved; __entry->prereserved_total = prereserved_total; __entry->btree_cache_dirty = btree_cache_dirty; @@ -172,9 +179,12 @@ TRACE_EVENT(journal_reclaim_start, __entry->btree_key_cache_total = btree_key_cache_total; ), - TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->direct, + __entry->kicked, __entry->min_nr, + __entry->min_key_cache, __entry->prereserved, __entry->prereserved_total, __entry->btree_cache_dirty, @@ -197,45 +207,13 @@ TRACE_EVENT(journal_reclaim_finish, __entry->nr_flushed = nr_flushed; ), - TP_printk("%d%d flushed %llu", + TP_printk("%d,%d flushed %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_flushed) ); /* allocator: */ -TRACE_EVENT(do_discards, - TP_PROTO(struct bch_fs *c, u64 seen, u64 open, - u64 need_journal_commit, u64 discarded, int ret), - TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, seen ) - __field(u64, open ) - __field(u64, need_journal_commit ) - __field(u64, discarded ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->seen = seen; - __entry->open = open; - __entry->need_journal_commit = need_journal_commit; - __entry->discarded = discarded; - __entry->ret = ret; - ), - - TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->seen, - __entry->open, - __entry->need_journal_commit, - __entry->discarded, - __entry->ret) -); - /* bset.c: */ DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -367,6 +345,11 @@ DEFINE_EVENT(btree_node, btree_merge, TP_ARGS(c, b) ); +DEFINE_EVENT(btree_node, btree_rewrite, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + DEFINE_EVENT(btree_node, btree_set_root, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -440,79 +423,18 @@ TRACE_EVENT(btree_node_relock_fail, /* Garbage collection */ -DEFINE_EVENT(btree_node, btree_gc_rewrite_node, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, - TP_PROTO(struct bch_fs *c, struct btree *b), - TP_ARGS(c, b) -); - -DEFINE_EVENT(bch_fs, gc_start, +DEFINE_EVENT(bch_fs, gc_gens_start, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); -DEFINE_EVENT(bch_fs, gc_end, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, +DEFINE_EVENT(bch_fs, gc_gens_end, TP_PROTO(struct bch_fs *c), TP_ARGS(c) ); /* Allocator */ -TRACE_EVENT(alloc_scan, - TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped), - TP_ARGS(ca, found, inc_gen, inc_gen_skipped), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, found ) - __field(u64, inc_gen ) - __field(u64, inc_gen_skipped ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - __entry->found = found; - __entry->inc_gen = inc_gen; - __entry->inc_gen_skipped = inc_gen_skipped; - ), - - TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->found, __entry->inc_gen, __entry->inc_gen_skipped) -); - -TRACE_EVENT(invalidate, - TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), - TP_ARGS(ca, offset, sectors), - - TP_STRUCT__entry( - __field(unsigned, sectors ) - __field(dev_t, dev ) - __field(__u64, offset ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - __entry->offset = offset, - __entry->sectors = sectors; - ), - - TP_printk("invalidated %u sectors at %d,%d sector=%llu", - __entry->sectors, - MAJOR(__entry->dev), - MINOR(__entry->dev), - __entry->offset) -); - TRACE_EVENT(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), TP_ARGS(ca, alloc_reserve), @@ -579,6 +501,59 @@ TRACE_EVENT(bucket_alloc_fail, __entry->ret) ); +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, int ret), + TP_ARGS(c, seen, open, need_journal_commit, discarded, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + __entry->ret = ret; + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->ret) +); + +TRACE_EVENT(invalidate_bucket, + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket), + TP_ARGS(c, dev, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = dev; + __entry->bucket = bucket; + ), + + TP_printk("%d:%d invalidated %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + /* Moving IO */ DEFINE_EVENT(bkey, move_extent, @@ -586,7 +561,7 @@ DEFINE_EVENT(bkey, move_extent, TP_ARGS(k) ); -DEFINE_EVENT(bkey, move_alloc_fail, +DEFINE_EVENT(bkey, move_alloc_mem_fail, TP_PROTO(const struct bkey *k), TP_ARGS(k) ); @@ -670,7 +645,7 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -DECLARE_EVENT_CLASS(transaction_restart, +DECLARE_EVENT_CLASS(transaction_event, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip), @@ -688,55 +663,61 @@ DECLARE_EVENT_CLASS(transaction_restart, TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) ); -DEFINE_EVENT(transaction_restart, transaction_restart_ip, +DEFINE_EVENT(transaction_event, transaction_commit, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_event, transaction_restart_ip, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, +DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, +DEFINE_EVENT(transaction_event, trans_restart_journal_preres_get, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, +DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, +DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_traverse_all, +DEFINE_EVENT(transaction_event, trans_traverse_all, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, +DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) ); -DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, +DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_PROTO(const char *trans_fn, unsigned long caller_ip), TP_ARGS(trans_fn, caller_ip) diff --git a/libbcachefs.c b/libbcachefs.c index 8ba0194..4fe2c3d 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -689,7 +689,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs) struct dirent *d; dev_names devs; - darray_init(devs); + darray_init(&devs); while ((errno = 0), (d = readdir(dir))) { struct dev_name n = { 0, NULL, NULL }; @@ -713,7 +713,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs) n.label = read_file_str(fs.sysfs_fd, label_attr); free(label_attr); - darray_push(devs, n); + darray_push(&devs, n); } closedir(dir); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7be4829..32ebf6c 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -382,7 +382,8 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -EINVAL; } - if (!a.v->io_time[READ]) { + if (!a.v->io_time[READ] && + test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { pr_buf(err, "cached bucket with read_time == 0"); return -EINVAL; } @@ -540,6 +541,7 @@ err: } int bch2_trans_mark_alloc(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -587,7 +589,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(old_a); new_lru = alloc_lru_idx(*new_a); @@ -1065,7 +1066,7 @@ static void bch2_do_discards_work(struct work_struct *work) percpu_ref_put(&c->writes); - trace_do_discards(c, seen, open, need_journal_commit, discarded, ret); + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret); } void bch2_do_discards(struct bch_fs *c) @@ -1087,6 +1088,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, POS(ca->dev_idx, 0), 0); +next_lru: k = bch2_btree_iter_peek(&lru_iter); ret = bkey_err(k); if (ret) @@ -1095,9 +1097,20 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (!k.k || k.k->p.inode != ca->dev_idx) goto out; - if (bch2_trans_inconsistent_on(k.k->type != KEY_TYPE_lru, trans, - "non lru key in lru btree")) - goto out; + if (k.k->type != KEY_TYPE_lru) { + pr_buf(&buf, "non lru key in lru btree:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + bch2_btree_iter_advance(&lru_iter); + goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; + goto out; + } + } idx = k.k->p.offset; bucket = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); @@ -1110,13 +1123,19 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) if (idx != alloc_lru_idx(a->v)) { pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); pr_buf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - goto out; + + if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { + bch_err(c, "%s", buf.buf); + bch2_btree_iter_advance(&lru_iter); + goto next_lru; + } else { + bch2_trans_inconsistent(trans, "%s", buf.buf); + ret = -EINVAL; + goto out; + } } SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); @@ -1129,6 +1148,10 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca) ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE); + if (ret) + goto out; + + trace_invalidate_bucket(c, a->k.p.inode, a->k.p.offset); out: bch2_trans_iter_exit(trans, &alloc_iter); bch2_trans_iter_exit(trans, &lru_iter); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 2bc622b..ff366e6 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -125,8 +125,8 @@ static inline bool bkey_is_alloc(const struct bkey *k) int bch2_alloc_read(struct bch_fs *); -int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); +int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_do_discards(struct bch_fs *); diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index c2af360..88ec860 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -276,10 +276,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc u64 *skipped_open, u64 *skipped_need_journal_commit, u64 *skipped_nouse, + struct bkey_s_c freespace_k, struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; struct open_bucket *ob; struct bch_alloc_v4 a; @@ -288,6 +289,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct printbuf buf = PRINTBUF; int ret; + if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { + pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + " freespace key ", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -298,29 +309,26 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc bch2_alloc_to_v4(k, &a); - if (bch2_fs_inconsistent_on(a.data_type != BCH_DATA_free, c, - "non free bucket in freespace btree (state %s)\n" - " %s\n" - " at %llu (genbits %u)", - bch2_data_types[a.data_type], - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - free_entry, genbits)) { + if (genbits != (alloc_freespace_genbits(a) >> 56)) { + pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(a) >> 56); + bch2_bkey_val_to_text(&buf, c, freespace_k); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; - } - if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c, - "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " %s", - genbits, alloc_freespace_genbits(a) >> 56, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ob = ERR_PTR(-EIO); - goto err; } - if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c, - "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)", - b, ca->mi.first_bucket, ca->mi.nbuckets)) { + if (a.data_type != BCH_DATA_free) { + pr_buf(&buf, "non free bucket in freespace btree\n" + " freespace key "); + bch2_bkey_val_to_text(&buf, c, freespace_k); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; } @@ -446,13 +454,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, BUG_ON(ca->new_fs_bucket_idx); - for_each_btree_key(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, *cur_bucket), 0, k, ret) { + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, *cur_bucket), 0, k, ret) { if (k.k->p.inode != ca->dev_idx) break; for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k)); - *cur_bucket != k.k->p.offset && !ob; + *cur_bucket < k.k->p.offset && !ob; (*cur_bucket)++) { if (btree_trans_too_many_iters(trans)) { ob = ERR_PTR(-EINTR); @@ -466,7 +474,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, skipped_open, skipped_need_journal_commit, skipped_nouse, - cl); + k, cl); } if (ob) break; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index e29a089..2eced20 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -494,11 +494,6 @@ struct bch_dev { enum { /* startup: */ - BCH_FS_ALLOC_CLEAN, - BCH_FS_INITIAL_GC_DONE, - BCH_FS_INITIAL_GC_UNFIXED, - BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_MAY_GO_RW, BCH_FS_RW, @@ -508,17 +503,22 @@ enum { BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_CLEAN_SHUTDOWN, + + /* fsck passes: */ + BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ + BCH_FS_CHECK_LRUS_DONE, + BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, /* errors: */ BCH_FS_ERROR, BCH_FS_TOPOLOGY_ERROR, BCH_FS_ERRORS_FIXED, BCH_FS_ERRORS_NOT_FIXED, - - /* misc: */ - BCH_FS_NEED_ANOTHER_GC, - BCH_FS_DELETED_NODES, - BCH_FS_REBUILD_REPLICAS, }; struct btree_debug { @@ -585,6 +585,7 @@ struct bch_fs { struct list_head list; struct kobject kobj; + struct kobject counters_kobj; struct kobject internal; struct kobject opts_dir; struct kobject time_stats; @@ -901,12 +902,15 @@ struct bch_fs { u64 last_bucket_seq_cleanup; - /* The rest of this all shows up in sysfs */ + /* TODO rewrite as counters - The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; atomic_long_t extent_migrate_done; atomic_long_t extent_migrate_raced; atomic_long_t bucket_alloc_fail; + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; + unsigned btree_gc_periodic:1; unsigned copy_gc_enabled:1; bool promote_whole_extents; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index cc279ab..1bea79c 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1086,7 +1086,8 @@ struct bch_sb_field { x(clean, 6) \ x(replicas, 7) \ x(journal_seq_blacklist, 8) \ - x(journal_v2, 9) + x(journal_v2, 9) \ + x(counters, 10) enum bch_sb_field_type { #define x(f, nr) BCH_SB_FIELD_##f = nr, @@ -1319,6 +1320,25 @@ struct bch_sb_field_disk_groups { struct bch_disk_group entries[0]; } __attribute__((packed, aligned(8))); +/* BCH_SB_FIELD_counters */ + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) + +enum bch_persistent_counters { +#define x(t, n, ...) BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[0]; +}; + /* * On clean shutdown, store btree roots and current journal sequence number in * the superblock: diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 4889177..db894b4 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -27,8 +27,8 @@ struct bkey_ops { void (*swab)(struct bkey_s); bool (*key_normalize)(struct bch_fs *, struct bkey_s); bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); - int (*trans_trigger)(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); + int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, @@ -80,16 +80,80 @@ static inline int bch2_mark_key(struct btree_trans *trans, : 0; } -static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, - struct bkey_i *new, unsigned flags) +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) { const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; return ops->trans_trigger - ? ops->trans_trigger(trans, old, new, flags) + ? ops->trans_trigger(trans, btree_id, level, old, new, flags) : 0; } +static inline int bch2_trans_mark_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index d027526..389b5a7 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1745,18 +1745,14 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) */ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) { - u64 start_time = local_clock(); unsigned iter = 0; int ret; lockdep_assert_held(&c->state_lock); - trace_gc_start(c); down_write(&c->gc_lock); - /* flush interior btree updates: */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); ret = bch2_gc_start(c, metadata_only) ?: bch2_gc_alloc_start(c, metadata_only) ?: @@ -1845,9 +1841,6 @@ out: up_write(&c->gc_lock); - trace_gc_end(c); - bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); - /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: @@ -1984,6 +1977,7 @@ int bch2_gc_gens(struct bch_fs *c) if (!mutex_trylock(&c->gc_gens_lock)) return 0; + trace_gc_gens_start(c); down_read(&c->gc_lock); bch2_trans_init(&trans, c, 0, 0); @@ -2035,6 +2029,7 @@ int bch2_gc_gens(struct bch_fs *c) c->gc_count++; bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + trace_gc_gens_end(c); err: for_each_member_device(ca, c, i) { kvfree(ca->oldest_gen); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index f283254..7a88392 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -820,10 +820,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, printbuf_reset(&buf); if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey:\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - pr_buf(&buf, " \n"); + pr_buf(&buf, "invalid bkey: "); bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -1081,10 +1081,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, !bversion_cmp(u.k->version, MAX_VERSION))) { printbuf_reset(&buf); - pr_buf(&buf, "invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - pr_buf(&buf, "\n "); + pr_buf(&buf, "invalid bkey: "); bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + pr_buf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); @@ -2102,29 +2102,33 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, } } -static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; unsigned i; + bool ret = false; restart: rcu_read_lock(); for_each_cached_btree(b, c, tbl, i, pos) if (test_bit(flag, &b->flags)) { rcu_read_unlock(); wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + ret = true; goto restart; } rcu_read_unlock(); + + return ret; } -void bch2_btree_flush_all_reads(struct bch_fs *c) +bool bch2_btree_flush_all_reads(struct bch_fs *c) { - __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); + return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); } -void bch2_btree_flush_all_writes(struct bch_fs *c) +bool bch2_btree_flush_all_writes(struct bch_fs *c) { - __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index d818d87..8af8536 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -152,8 +152,8 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); } -void bch2_btree_flush_all_reads(struct bch_fs *); -void bch2_btree_flush_all_writes(struct bch_fs *); +bool bch2_btree_flush_all_reads(struct bch_fs *); +bool bch2_btree_flush_all_writes(struct bch_fs *); static inline void compat_bformat(unsigned level, enum btree_id btree_id, unsigned version, unsigned big_endian, diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 7f7bf13..e7541af 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1527,6 +1527,30 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } +static void btree_path_set_level_up(struct btree_path *path) +{ + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); +} + static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, struct btree_path *path, int check_pos) @@ -2100,7 +2124,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_path *path = iter->path; struct btree *b = NULL; - unsigned l; int ret; BUG_ON(trans->restarted); @@ -2113,10 +2136,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? */ if (!btree_path_node(path, path->level + 1)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_level_up(path); return NULL; } @@ -2148,14 +2168,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - path->level = iter->min_depth; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); - - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_iter_verify(iter); + btree_path_set_level_down(trans, path, iter->min_depth); ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) @@ -2186,15 +2199,23 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; + + iter->advanced = true; + return false; + } } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -2377,6 +2398,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + if (iter->update_path) { bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); @@ -2494,6 +2517,100 @@ out: return k; } +/** + * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal + * to iterator's current position, returning keys from every level of the btree. + * For keys at different levels of the btree that compare equal, the key from + * the lower level (leaf) is returned first. + */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + /* Already at end? */ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + bpos_cmp(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + !bpos_cmp(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if (iter->advanced && + iter->path->level == iter->min_depth && + bpos_cmp(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; +out: + iter->path->should_be_locked = true; + bch2_btree_iter_verify(iter); + + return k; +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -2650,9 +2767,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2687,7 +2805,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && (next_update = bch2_journal_keys_peek_slot(trans->c, - iter->btree_id, 0, iter->pos))) { + iter->btree_id, + iter->path->level, + iter->pos))) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); goto out; @@ -2704,6 +2824,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos next; + EBUG_ON(iter->path->level); + if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; struct bpos end = iter->pos; @@ -2802,6 +2924,9 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; unsigned i; + if (!bch2_debug_check_iterators) + return; + trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { bch2_dump_trans_paths_updates(trans); @@ -2919,6 +3044,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, { EBUG_ON(trans->restarted); + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; @@ -2934,12 +3062,6 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) flags |= BTREE_ITER_WITH_JOURNAL; - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - iter->trans = trans; iter->path = NULL; iter->update_path = NULL; @@ -2965,6 +3087,12 @@ void bch2_trans_iter_init(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) { + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + __bch2_trans_iter_init(trans, iter, btree_id, pos, 0, 0, flags, _RET_IP_); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index f670029..dad05ea 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -212,6 +212,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); + static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { return bch2_btree_iter_peek_upto(iter, SPOS_MAX); @@ -313,9 +315,9 @@ static inline int bkey_err(struct bkey_s_c k) static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_SLOTS - ? bch2_btree_iter_peek_slot(iter) - : bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : + flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); } static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index ab394c2..a575189 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -236,6 +236,13 @@ static int btree_key_cache_fill(struct btree_trans *trans, */ new_u64s = k.k->u64s + 1; + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + new_u64s = min(256U, (new_u64s * 3) / 2); + if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 3438e08..e4ed46a 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -182,22 +182,16 @@ struct btree_node_iter { * Iterate over all possible positions, synthesizing deleted keys for holes: */ #define BTREE_ITER_SLOTS (1 << 0) +#define BTREE_ITER_ALL_LEVELS (1 << 1) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 1) +#define BTREE_ITER_INTENT (1 << 2) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 2) -/* - * Indicates that this iterator should not be reused until transaction commit, - * either because a pending update references it or because the update depends - * on that particular key being locked (e.g. by the str_hash code, for hash - * table consistency) - */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) +#define BTREE_ITER_PREFETCH (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos @@ -282,7 +276,8 @@ struct btree_iter { struct btree_path *key_cache_path; enum btree_id btree_id:4; - unsigned min_depth:4; + unsigned min_depth:3; + unsigned advanced:1; /* btree_iter_copy starts here: */ u16 flags; @@ -639,42 +634,6 @@ static inline bool btree_type_has_snapshots(enum btree_id id) return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; } -enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, - __BTREE_UPDATE_KEY_CACHE_RECLAIM, - - __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ - - __BTREE_TRIGGER_INSERT, - __BTREE_TRIGGER_OVERWRITE, - - __BTREE_TRIGGER_GC, - __BTREE_TRIGGER_BUCKET_INVALIDATE, - __BTREE_TRIGGER_NOATOMIC, -}; - -#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) - -#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) - -#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) - -#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) - -#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_alloc)| \ - (1U << KEY_TYPE_alloc_v2)| \ - (1U << KEY_TYPE_alloc_v3)| \ - (1U << KEY_TYPE_alloc_v4)| \ - (1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)| \ - (1U << KEY_TYPE_inode_v2)| \ - (1U << KEY_TYPE_snapshot)) - static inline bool btree_node_type_needs_gc(enum btree_node_type type) { return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index b1aa77b..686e51d 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -381,16 +381,13 @@ static void bch2_btree_reserve_put(struct btree_update *as) struct bch_fs *c = as->c; struct prealloc_nodes *p; - mutex_lock(&c->btree_reserve_cache_lock); - for (p = as->prealloc_nodes; p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); p++) { while (p->nr) { struct btree *b = p->b[--p->nr]; - six_lock_intent(&b->c.lock, NULL, NULL); - six_lock_write(&b->c.lock, NULL, NULL); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -404,13 +401,15 @@ static void bch2_btree_reserve_put(struct btree_update *as) bch2_open_buckets_put(c, &b->ob); } + mutex_unlock(&c->btree_reserve_cache_lock); + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } } - - mutex_unlock(&c->btree_reserve_cache_lock); } static int bch2_btree_reserve_get(struct btree_update *as, @@ -506,20 +505,18 @@ static void bch2_btree_update_free(struct btree_update *as) mutex_unlock(&c->btree_interior_update_lock); } -static void btree_update_will_delete_key(struct btree_update *as, - struct bkey_i *k) +static void btree_update_add_key(struct btree_update *as, + struct keylist *keys, struct btree *b) { - BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > + struct bkey_i *k = &b->key; + + BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > ARRAY_SIZE(as->_old_keys)); - bch2_keylist_add(&as->old_keys, k); -} -static void btree_update_will_add_key(struct btree_update *as, - struct bkey_i *k) -{ - BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > - ARRAY_SIZE(as->_new_keys)); - bch2_keylist_add(&as->new_keys, k); + bkey_copy(keys->top, k); + bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; + + bch2_keylist_push(keys); } /* @@ -532,7 +529,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, struct bkey_i *k; int ret; - ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s); + ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); if (ret) return ret; @@ -543,14 +540,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, trans->journal_pin = &as->journal; - for_each_keylist_key(&as->new_keys, k) { - ret = bch2_trans_mark_new(trans, k, 0); + for_each_keylist_key(&as->old_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); if (ret) return ret; } - for_each_keylist_key(&as->old_keys, k) { - ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0); + for_each_keylist_key(&as->new_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); if (ret) return ret; } @@ -653,8 +654,8 @@ err: if (!ret) { i->journal_seq = cpu_to_le64( - max(journal_seq, - le64_to_cpu(i->journal_seq))); + max(journal_seq, + le64_to_cpu(i->journal_seq))); bch2_btree_add_journal_pin(c, b, journal_seq); } else { @@ -822,7 +823,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree mutex_unlock(&c->btree_interior_update_lock); - btree_update_will_add_key(as, &b->key); + btree_update_add_key(as, &as->new_keys, b); } /* @@ -875,7 +876,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b * btree_updates to point to this btree_update: */ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, - struct btree *b) + struct btree *b) { struct bch_fs *c = as->c; struct btree_update *p, *n; @@ -939,7 +940,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, */ btree_update_drop_new_node(c, b); - btree_update_will_delete_key(as, &b->key); + btree_update_add_key(as, &as->old_keys, b); as->old_nodes[as->nr_old_nodes] = b; as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; @@ -1095,11 +1096,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) list_del_init(&b->list); mutex_unlock(&c->btree_cache.lock); - if (b->c.level) - six_lock_pcpu_alloc(&b->c.lock); - else - six_lock_pcpu_free(&b->c.lock); - mutex_lock(&c->btree_root_lock); BUG_ON(btree_node_root(c, b) && (b->c.level < btree_node_root(c, b)->c.level || @@ -1249,13 +1245,14 @@ static struct btree *__btree_split_node(struct btree_update *as, struct bpos n1_pos; n2 = bch2_btree_node_alloc(as, n1->c.level); - bch2_btree_update_add_new_node(as, n2); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); n2->key.k.p = n1->key.k.p; + bch2_btree_update_add_new_node(as, n2); + set1 = btree_bset_first(n1); set2 = btree_bset_first(n2); @@ -1412,7 +1409,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, b); n1 = bch2_btree_node_alloc_replacement(as, b); - bch2_btree_update_add_new_node(as, n1); if (keys) btree_split_insert_keys(as, trans, path, n1, keys); @@ -1427,6 +1423,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); + bch2_btree_update_add_new_node(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); @@ -1455,6 +1453,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_build_aux_trees(n1); six_unlock_write(&n1->c.lock); + bch2_btree_update_add_new_node(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); if (parent) @@ -1723,7 +1723,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_interior_update_will_free_node(as, m); n = bch2_btree_node_alloc(as, b->c.level); - bch2_btree_update_add_new_node(as, n); SET_BTREE_NODE_SEQ(n->data, max(BTREE_NODE_SEQ(b->data), @@ -1731,8 +1730,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_set_min(n, prev->data->min_key); btree_set_max(n, next->data->max_key); - n->data->format = new_f; + bch2_btree_update_add_new_node(as, n); + + n->data->format = new_f; btree_node_set_format(n, new_f); bch2_btree_sort_into(c, n, prev); @@ -1797,10 +1798,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, as = bch2_btree_update_start(trans, iter->path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret) { - trace_btree_gc_rewrite_node_fail(c, b); + if (ret) goto out; - } bch2_btree_interior_update_will_free_node(as, b); @@ -1810,7 +1809,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, bch2_btree_build_aux_trees(n); six_unlock_write(&n->c.lock); - trace_btree_gc_rewrite_node(c, b); + trace_btree_rewrite(c, b); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -1915,11 +1914,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, int ret; if (!skip_triggers) { - ret = bch2_trans_mark_new(trans, new_key, 0); + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); if (ret) return ret; - ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0); + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); if (ret) return ret; } @@ -1956,7 +1957,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - ret = darray_make_room(trans->extra_journal_entries, + ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(new_key->k.u64s)); if (ret) return ret; @@ -2158,19 +2159,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) +static bool bch2_btree_interior_updates_pending(struct bch_fs *c) { - size_t ret = 0; - struct list_head *i; + bool ret; mutex_lock(&c->btree_interior_update_lock); - list_for_each(i, &c->btree_interior_update_list) - ret++; + ret = !list_empty(&c->btree_interior_update_list); mutex_unlock(&c->btree_interior_update_lock); return ret; } +bool bch2_btree_interior_updates_flush(struct bch_fs *c) +{ + bool ret = bch2_btree_interior_updates_pending(c); + + if (ret) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_pending(c)); + return ret; +} + void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) { struct btree_root *r; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index e72eb87..adfc6c2 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -309,7 +309,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); +bool bch2_btree_interior_updates_flush(struct bch_fs *); void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index ef90f7a..58bb687 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -478,16 +478,16 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; - return bch2_trans_mark_key(trans, old, i->k, + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, BTREE_TRIGGER_INSERT| BTREE_TRIGGER_OVERWRITE| i->flags) ?: 1; } else if (overwrite && !i->overwrite_trigger_run) { i->overwrite_trigger_run = true; - return bch2_trans_mark_old(trans, old, i->flags) ?: 1; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; } else if (!overwrite && !i->insert_trigger_run) { i->insert_trigger_run = true; - return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; } else { return 0; } @@ -1111,6 +1111,8 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->journal_u64s = trans->extra_journal_entries.nr; @@ -1159,6 +1161,8 @@ retry: if (ret) goto err; + + trace_transaction_commit(trans->fn, _RET_IP_); out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -1753,7 +1757,7 @@ int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) struct jset_entry_log *l; int ret; - ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s)); + ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); if (ret) return ret; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 8202bf1..c5c904d 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -378,10 +378,9 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, idx = bch2_replicas_entry_idx(c, r); if (idx < 0 && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err(c, "no replicas entry\n" - " while marking %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { percpu_up_read(&c->mark_lock); ret = bch2_mark_replicas(c, r); percpu_down_read(&c->mark_lock); @@ -596,9 +595,6 @@ int bch2_mark_alloc(struct btree_trans *trans, bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); return ret; } - - trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), - old_a.cached_sectors); } return 0; @@ -1447,6 +1443,7 @@ err: } int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -1585,6 +1582,7 @@ err: } int bch2_trans_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -1655,6 +1653,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, } int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) @@ -1671,6 +1670,7 @@ int bch2_trans_mark_inode(struct btree_trans *trans, } int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) @@ -1772,6 +1772,7 @@ err: } int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 8f360b3..3469327 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -202,41 +202,14 @@ int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsi int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); - -static inline int bch2_trans_mark_old(struct btree_trans *trans, - struct bkey_s_c old, unsigned flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = old.k->p; - - return bch2_trans_mark_key(trans, old, &deleted, - BTREE_TRIGGER_OVERWRITE|flags); -} - -static inline int bch2_trans_mark_new(struct btree_trans *trans, - struct bkey_i *new, unsigned flags) -{ - struct bkey_i deleted; - - bkey_init(&deleted.k); - deleted.k.p = new->k.p; - - return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new, - BTREE_TRIGGER_INSERT|flags); -} - int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c new file mode 100644 index 0000000..25a6b38 --- /dev/null +++ b/libbcachefs/counters.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "super-io.h" +#include "counters.h" + +/* BCH_SB_FIELD_counters */ + +const char * const bch2_counter_names[] = { +#define x(t, n, ...) (#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + +static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) +{ + if (!ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +}; + +static int bch2_sb_counters_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + return 0; +}; + +void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + for (i = 0; i < nr; i++) { + if (i < BCH_COUNTER_NR) + pr_buf(out, "%s", bch2_counter_names[i]); + else + pr_buf(out, "(unknown)"); + + pr_tab(out); + pr_buf(out, "%llu", le64_to_cpu(ctrs->d[i])); + pr_newline(out); + }; +}; + +int bch2_sb_counters_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + u64 val = 0; + + for (i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { + val = le64_to_cpu(ctrs->d[i]); + percpu_u64_set(&c->counters[i], val); + c->counters_on_mount[i] = val; + } + return 0; +}; + +int bch2_sb_counters_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ret; + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_resize_counters(&c->disk_sb, + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) + ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; +} + +int bch2_fs_counters_init(struct bch_fs *c) +{ + int ret = 0; + + c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); + + if (!c->counters) + return -ENOMEM; + + ret = bch2_sb_counters_to_cpu(c); + + return ret; +} + +const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, +}; diff --git a/libbcachefs/counters.h b/libbcachefs/counters.h new file mode 100644 index 0000000..1f3207a --- /dev/null +++ b/libbcachefs/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COUNTERS_H +#define _BCACHEFS_COUNTERS_H + +#include "bcachefs.h" +#include "super-io.h" + + +int bch2_sb_counters_to_cpu(struct bch_fs *c); + +int bch2_sb_counters_from_cpu(struct bch_fs *c); + +int bch2_fs_counters_init(struct bch_fs *c); + +extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + +#endif // _BCACHEFS_COUNTERS_H diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 745b1cd..049e1d1 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -36,7 +36,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) } #define darray_make_room(_d, _more) \ - __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more)) + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) #define darray_top(_d) ((_d).data[(_d).nr]) @@ -45,7 +45,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) int _ret = darray_make_room((_d), 1); \ \ if (!_ret) \ - (_d).data[(_d).nr++] = (_item); \ + (_d)->data[(_d)->nr++] = (_item); \ _ret; \ }) @@ -54,7 +54,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) int _ret = darray_make_room((_d), 1); \ \ if (!_ret) \ - array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \ + array_insert_item((_d)->data, (_d)->nr, (_pos), (_item));\ _ret; \ }) @@ -63,13 +63,13 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) #define darray_init(_d) \ do { \ - (_d).data = NULL; \ - (_d).nr = (_d).size = 0; \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ } while (0) #define darray_exit(_d) \ do { \ - kfree((_d).data); \ + kfree((_d)->data); \ darray_init(_d); \ } while (0) diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 2d65ae3..3b869be 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * bch2_flags_to_text(out, bch2_btree_node_flags, b->flags); pr_newline(out); + pr_buf(out, "pcpu read locks: "); + pr_tab(out); + pr_buf(out, "%u", b->c.lock.readers != NULL); + pr_newline(out); + pr_buf(out, "written:"); pr_tab(out); pr_buf(out, "%u", b->written); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index dffbcff..2e541a4 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -308,8 +308,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc.uncompressed_size + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); - if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > + if (crc_l.uncompressed_size + crc_r.uncompressed_size > bch2_crc_field_size_max[extent_entry_type(en_l)]) return false; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 05a0246..4cb2b2e 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -232,7 +232,10 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); - BUG_ON((s64) inode->v.i_blocks + sectors < 0); + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -2710,9 +2713,11 @@ int bch2_truncate(struct user_namespace *mnt_userns, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); - WARN_ON(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal)); - + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); if (unlikely(ret)) goto err; diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 963834d..f1abec9 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -560,7 +560,7 @@ struct inode_walker { static void inode_walker_exit(struct inode_walker *w) { - darray_exit(w->inodes); + darray_exit(&w->inodes); } static struct inode_walker inode_walker_init(void) @@ -575,7 +575,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, BUG_ON(bch2_inode_unpack(inode, &u)); - return darray_push(w->inodes, ((struct inode_walker_entry) { + return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, })); @@ -628,7 +628,7 @@ found: while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]); + ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); if (ret) return ret; @@ -740,8 +740,9 @@ static int hash_check_key(struct btree_trans *trans, if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash), - BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, desc.btree_id, + POS(hash_k.k->p.inode, hash), + BTREE_ITER_SLOTS, k, ret) { if (!bkey_cmp(k.k->p, hash_k.k->p)) break; @@ -759,16 +760,15 @@ static int hash_check_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); goto bad_hash; } - } out: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; bad_hash: - if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " + if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, " "hashed to %llu\n%s", - desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash, + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE) return 0; @@ -1405,8 +1405,8 @@ static int check_dirent_target(struct btree_trans *trans, if (fsck_err_on(backpointer_exists && !target->bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - target->bi_inum)) { + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { target->bi_nlink++; target->bi_flags &= ~BCH_INODE_UNLINKED; @@ -1879,7 +1879,7 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) static int path_down(struct bch_fs *c, pathbuf *p, u64 inum, u32 snapshot) { - int ret = darray_push(*p, ((struct pathbuf_entry) { + int ret = darray_push(p, ((struct pathbuf_entry) { .inum = inum, .snapshot = snapshot, })); @@ -2037,7 +2037,7 @@ static int check_directory_structure(struct bch_fs *c) BUG_ON(ret == -EINTR); - darray_exit(path); + darray_exit(&path); bch2_trans_exit(&trans); return ret; @@ -2254,8 +2254,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, } if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, - "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)", - u.bi_inum, mode_to_type(u.bi_mode), + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 223344e..1ad4c7d 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1288,6 +1288,7 @@ void bch2_write(struct closure *cl) goto err; } + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -2200,6 +2201,7 @@ get_bio: if (rbio->bounce) trace_read_bounce(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 75be0a5..4c5b675 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -792,8 +792,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, int ret = 0; if (c) { - bch2_journal_block(&c->journal); bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); } bu = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 7fee0c0..59453dc 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -146,8 +146,6 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) return j->seq_ondisk + 1; } -void bch2_journal_set_has_inum(struct journal *, u64, u64); - static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 7c0aed9..e537a57 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1055,7 +1055,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) jlist.ret = 0; for_each_member_device(ca, c, iter) { - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + if (!c->opts.fsck && !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; @@ -1212,10 +1212,9 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) bch2_replicas_entry_to_text(&buf, &replicas.e); if (!degraded && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, - "superblock not marked as containing replicas %s", - buf.buf))) { + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", + buf.buf)) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) goto err; @@ -1442,7 +1441,8 @@ static void journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - journal_reclaim_kick(&c->journal); + if (j->watermark) + journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ closure_debug_destroy(cl); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index a9f7d5a..fdc94e8 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -589,7 +589,7 @@ static u64 journal_seq_to_flush(struct journal *j) * 512 journal entries or 25% of all journal buckets, then * journal_next_bucket() should not stall. */ -static int __bch2_journal_reclaim(struct journal *j, bool direct) +static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; @@ -638,8 +638,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) min_nr = 1; - trace_journal_reclaim_start(c, - min_nr, + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + trace_journal_reclaim_start(c, direct, kicked, + min_nr, min_key_cache, j->prereserved.reserved, j->prereserved.remaining, atomic_read(&c->btree_cache.dirty), @@ -647,8 +649,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); - nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); @@ -669,7 +669,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) int bch2_journal_reclaim(struct journal *j) { - return __bch2_journal_reclaim(j, true); + return __bch2_journal_reclaim(j, true, true); } static int bch2_journal_reclaim_thread(void *arg) @@ -685,10 +685,12 @@ static int bch2_journal_reclaim_thread(void *arg) j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { + bool kicked = j->reclaim_kicked; + j->reclaim_kicked = false; mutex_lock(&j->reclaim_lock); - ret = __bch2_journal_reclaim(j, false); + ret = __bch2_journal_reclaim(j, false, kicked); mutex_unlock(&j->reclaim_lock); now = jiffies; diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index 506044e..6d98431 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "journal_sb.h" +#include "darray.h" #include @@ -142,12 +143,6 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, } for (i = 0; i + 1 < nr; i++) { - if (b[i].end == b[i + 1].start) { - pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu", - b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); - goto err; - } - if (b[i].end > b[i + 1].start) { pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); @@ -219,5 +214,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) } } + BUG_ON(dst + 1 != nr); + return 0; } diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index fe9d157..ce23b38 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -204,7 +204,9 @@ int bch2_check_lrus(struct bch_fs *c, bool initial) for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, bch2_check_lru_key(&trans, &iter, initial)); if (ret) break; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 6defc33..5345697 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -175,10 +175,7 @@ next: goto err; } - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - + bch2_btree_interior_updates_flush(c); ret = 0; err: bch2_trans_exit(&trans); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 1de2135..f1fb2ab 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -125,7 +125,7 @@ next: } } bch2_trans_iter_exit(trans, &iter); - darray_exit(s.ids); + darray_exit(&s.ids); return ret; } @@ -574,6 +574,7 @@ static int bch2_move_extent(struct btree_trans *trans, atomic64_inc(&ctxt->stats->keys_moved); atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); trace_move_extent(k.k); @@ -596,7 +597,7 @@ err_free_pages: err_free: kfree(io); err: - trace_move_alloc_fail(k.k); + trace_move_alloc_mem_fail(k.k); return ret; } @@ -941,9 +942,7 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); - /* flush relevant btree updates */ - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + bch2_btree_interior_updates_flush(c); progress_list_del(c, stats); return ret; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 8bc67d0..85f0296 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -316,11 +316,6 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ - x(rebuild_replicas, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index a9b4ad1..ff483ff 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -147,7 +147,7 @@ static void journal_iters_fix(struct bch_fs *c) /* * If an iterator points one after the key we just inserted, - * and the key we just inserted compares >= the iterator's position, + * and the key we just inserted compares > the iterator's position, * decrement the iterator so it points at the key we just inserted: */ list_for_each_entry(iter, &c->journal_iters, journal.list) @@ -155,7 +155,7 @@ static void journal_iters_fix(struct bch_fs *c) iter->last && iter->b->c.btree_id == n->btree_id && iter->b->c.level == n->level && - bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0) + bpos_cmp(n->k->k.p, iter->unpacked.p) > 0) iter->journal.idx = keys->gap - 1; } @@ -994,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c) if (ret) return ret; - bkey_subvolume_init(&root_volume.k_i); root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; root_volume.v.flags = 0; @@ -1087,12 +1086,6 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.fix_errors = FSCK_OPT_YES; } - if (!c->replicas.entries || - c->opts.rebuild_replicas) { - bch_info(c, "building replicas info"); - set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - } - if (!c->opts.nochanges) { if (c->sb.version < bcachefs_metadata_version_new_data_types) { bch_info(c, "version prior to new_data_types, upgrade and fsck required"); @@ -1102,6 +1095,12 @@ int bch2_fs_recovery(struct bch_fs *c) } } + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); + ret = -EINVAL; + goto err; + } + ret = bch2_blacklist_table_initialize(c); if (ret) { bch_err(c, "error initializing blacklist table"); @@ -1195,6 +1194,13 @@ use_clean: if (ret) goto err; + /* + * Skip past versions that might have possibly been used (as nonces), + * but hadn't had their pointers written: + */ + if (c->sb.encryption_type && !c->sb.clean) + atomic64_add(1 << 16, &c->key_version); + ret = read_btree_roots(c); if (ret) goto err; @@ -1217,17 +1223,9 @@ use_clean: goto err; bch_verbose(c, "stripes_read done"); - /* - * If we're not running fsck, this ensures bch2_fsck_err() calls are - * instead interpreted as bch2_inconsistent_err() calls: - */ - if (!c->opts.fsck) - set_bit(BCH_FS_FSCK_DONE, &c->flags); + bch2_stripes_heap_start(c); - if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || - test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { + if (c->opts.fsck) { bool metadata_only = c->opts.norecovery; bch_info(c, "checking allocations"); @@ -1236,63 +1234,70 @@ use_clean: if (ret) goto err; bch_verbose(c, "done checking allocations"); - } - if (c->opts.fsck) { + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + bch_info(c, "checking need_discard and freespace btrees"); err = "error checking need_discard and freespace btrees"; ret = bch2_check_alloc_info(c); - if (ret) - goto err; - - ret = bch2_check_lrus(c, true); if (ret) goto err; bch_verbose(c, "done checking need_discard and freespace btrees"); - } - - bch2_stripes_heap_start(c); - - clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - /* - * Skip past versions that might have possibly been used (as nonces), - * but hadn't had their pointers written: - */ - if (c->sb.encryption_type && !c->sb.clean) - atomic64_add(1 << 16, &c->key_version); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); - if (c->opts.norecovery) - goto out; + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); + bch_info(c, "checking lrus"); + err = "error checking lrus"; + ret = bch2_check_lrus(c, true); + if (ret) + goto err; + bch_verbose(c, "done checking lrus"); - err = "error initializing freespace"; - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - if (c->opts.fsck) { bch_info(c, "checking alloc to lru refs"); err = "error checking alloc to lru refs"; ret = bch2_check_alloc_to_lru_refs(c); if (ret) goto err; + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ret = bch2_check_lrus(c, true); if (ret) goto err; bch_verbose(c, "done checking alloc to lru refs"); + } else { + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); + set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + if (c->opts.norecovery) + goto out; + + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); + err = "journal replay failed"; + ret = bch2_journal_replay(c); + if (ret) + goto err; + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); } + err = "error initializing freespace"; + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { bch2_fs_lazy_rw(c); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 6a81eb9..a53a3d5 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -110,6 +110,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r } int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { @@ -124,7 +125,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, } } - return bch2_trans_mark_extent(trans, old, new, flags); + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); } /* indirect inline data */ @@ -153,6 +154,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, } int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index e0a9d8e..f9848dc 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -20,8 +20,8 @@ int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c, - struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); #define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ .key_invalid = bch2_reflink_v_invalid, \ @@ -36,6 +36,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 63a5739..81bdcb7 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -565,7 +565,7 @@ static int snapshot_id_add(snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); - return darray_push(*s, id); + return darray_push(s, id); } static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, @@ -622,7 +622,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - darray_exit(equiv_seen); + darray_exit(&equiv_seen); return ret; } @@ -722,7 +722,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) } } err: - darray_exit(deleted); + darray_exit(&deleted); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); } @@ -888,7 +888,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); s = c->snapshots_unlinked; - darray_init(c->snapshots_unlinked); + darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); if (!s.nr) @@ -905,7 +905,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) } } - darray_exit(s); + darray_exit(&s); } percpu_ref_put(&c->writes); diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index a442538..b1739d2 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -76,7 +76,7 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - int ret = darray_push(s->ids, id); + int ret = darray_push(&s->ids, id); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 1aaae14..a2b789b 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -17,6 +17,7 @@ #include "super-io.h" #include "super.h" #include "vstructs.h" +#include "counters.h" #include #include @@ -818,6 +819,8 @@ int bch2_write_super(struct bch_fs *c) SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + bch2_sb_counters_from_cpu(c); + for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 1401cb5..bdac2b7 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -44,6 +44,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "counters.h" #include #include @@ -71,6 +72,9 @@ struct kobj_type type ## _ktype = { \ static void bch2_fs_release(struct kobject *); static void bch2_dev_release(struct kobject *); +static void bch2_fs_counters_release(struct kobject *k) +{ +} static void bch2_fs_internal_release(struct kobject *k) { @@ -85,6 +89,7 @@ static void bch2_fs_time_stats_release(struct kobject *k) } static KTYPE(bch2_fs); +static KTYPE(bch2_fs_counters); static KTYPE(bch2_fs_internal); static KTYPE(bch2_fs_opts_dir); static KTYPE(bch2_fs_time_stats); @@ -188,57 +193,33 @@ static void __bch2_fs_read_only(struct bch_fs *c) { struct bch_dev *ca; unsigned i, clean_passes = 0; + u64 seq = 0; bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); - /* - * Flush journal before stopping allocators, because flushing journal - * blacklist entries involves allocating new btree nodes: - */ - bch2_journal_flush_all_pins(&c->journal); - bch_verbose(c, "flushing journal and stopping allocators"); - bch2_journal_flush_all_pins(&c->journal); - do { clean_passes++; - if (bch2_journal_flush_all_pins(&c->journal)) - clean_passes = 0; - - /* - * In flight interior btree updates will generate more journal - * updates and btree updates (alloc btree): - */ - if (bch2_btree_interior_updates_nr_pending(c)) { - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); + if (bch2_btree_interior_updates_flush(c) || + bch2_journal_flush_all_pins(&c->journal) || + bch2_btree_flush_all_writes(c) || + seq != atomic64_read(&c->journal.seq)) { + seq = atomic64_read(&c->journal.seq); clean_passes = 0; } - flush_work(&c->btree_interior_update_work); - - if (bch2_journal_flush_all_pins(&c->journal)) - clean_passes = 0; } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); - - set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - flush_work(&c->btree_interior_update_work); + bch_verbose(c, "flushing journal and stopping allocators complete"); + if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); bch2_fs_journal_stop(&c->journal); - /* - * the journal kicks off btree writes via reclaim - wait for in flight - * writes after stopping journal: - */ - bch2_btree_flush_all_writes(c); - /* * After stopping journal: */ @@ -297,7 +278,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_ERROR, &c->flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && + test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && !c->opts.norecovery) { bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -388,7 +369,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); + clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); @@ -517,6 +498,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); kobject_put(&c->internal); @@ -585,6 +567,7 @@ static int bch2_fs_online(struct bch_fs *c) kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { bch_err(c, "error creating sysfs objects"); @@ -633,6 +616,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); c->minor = -1; c->disk_sb.fs_sb = true; @@ -796,7 +780,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_fsio_init(c); + bch2_fs_fsio_init(c) ?: + bch2_fs_counters_init(c); if (ret) goto err; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index d3919fa..77e2ec7 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -40,7 +40,7 @@ #include "util.h" #define SYSFS_OPS(type) \ -struct sysfs_ops type ## _sysfs_ops = { \ +const struct sysfs_ops type ## _sysfs_ops = { \ .show = type ## _show, \ .store = type ## _store \ } @@ -55,6 +55,9 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ struct printbuf out = PRINTBUF; \ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ + pr_newline(&out); \ + \ if (!ret && out.allocation_failure) \ ret = -ENOMEM; \ \ @@ -191,6 +194,10 @@ read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); read_attribute(bucket_alloc_fail); +#define x(t, n, ...) read_attribute(t); +BCH_PERSISTENT_COUNTERS() +#undef x + rw_attribute(discard); rw_attribute(label); @@ -544,6 +551,47 @@ struct attribute *bch2_fs_files[] = { NULL }; +/* counters dir */ + +SHOW(bch2_fs_counters) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); + u64 counter = 0; + u64 counter_since_mount = 0; + + out->tabstops[0] = 32; + #define x(t, ...) \ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + pr_buf(out, "since mount:"); \ + pr_tab(out); \ + bch2_hprint(out, counter_since_mount << 9); \ + pr_newline(out); \ + \ + pr_buf(out, "since filesystem creation:"); \ + pr_tab(out); \ + bch2_hprint(out, counter << 9); \ + pr_newline(out); \ + } + BCH_PERSISTENT_COUNTERS() + #undef x + return 0; +} + +STORE(bch2_fs_counters) { + return 0; +} + +SYSFS_OPS(bch2_fs_counters); + +struct attribute *bch2_fs_counters_files[] = { +#define x(t, ...) \ + &sysfs_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; /* internal dir - just a wrapper */ SHOW(bch2_fs_internal) @@ -614,7 +662,7 @@ STORE(bch2_fs_opts_dir) { struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret = size, id = opt - bch2_opt_table; + int ret, id = opt - bch2_opt_table; char *tmp; u64 v; @@ -649,6 +697,8 @@ STORE(bch2_fs_opts_dir) bch2_rebalance_add_work(c, S64_MAX); rebalance_wakeup(c); } + + ret = size; err: percpu_ref_put(&c->writes); return ret; diff --git a/libbcachefs/sysfs.h b/libbcachefs/sysfs.h index 525fd05..222cd50 100644 --- a/libbcachefs/sysfs.h +++ b/libbcachefs/sysfs.h @@ -10,28 +10,32 @@ struct attribute; struct sysfs_ops; extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_counters_files[]; extern struct attribute *bch2_fs_internal_files[]; extern struct attribute *bch2_fs_opts_dir_files[]; extern struct attribute *bch2_fs_time_stats_files[]; extern struct attribute *bch2_dev_files[]; -extern struct sysfs_ops bch2_fs_sysfs_ops; -extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -extern struct sysfs_ops bch2_dev_sysfs_ops; +extern const struct sysfs_ops bch2_fs_sysfs_ops; +extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; +extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern const struct sysfs_ops bch2_dev_sysfs_ops; int bch2_opts_create_sysfs_files(struct kobject *); #else static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_counters_files[] = {}; static struct attribute *bch2_fs_internal_files[] = {}; static struct attribute *bch2_fs_opts_dir_files[] = {}; static struct attribute *bch2_fs_time_stats_files[] = {}; static struct attribute *bch2_dev_files[] = {}; static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_counters_sysfs_ops; static const struct sysfs_ops bch2_fs_internal_sysfs_ops; static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index a2d6bb7..5143b60 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include diff --git a/tools-util.c b/tools-util.c index e338152..591e2a0 100644 --- a/tools-util.c +++ b/tools-util.c @@ -306,10 +306,10 @@ void ranges_sort_merge(ranges *r) if (t && t->end >= i->start) t->end = max(t->end, i->end); else - darray_push(tmp, *i); + darray_push(&tmp, *i); } - darray_exit(*r); + darray_exit(r); *r = tmp; } diff --git a/tools-util.h b/tools-util.h index a0e20eb..136d7d6 100644 --- a/tools-util.h +++ b/tools-util.h @@ -76,7 +76,7 @@ typedef DARRAY(struct range) ranges; static inline void range_add(ranges *data, u64 offset, u64 size) { - darray_push(*data, ((struct range) { + darray_push(data, ((struct range) { .start = offset, .end = offset + size }));