git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to bdf6d7c135 fixup! bcachefs: Kill journal buf bloom filter
author: Kent Overstreet <kent.overstreet@gmail.com>
Mon, 2 May 2022 22:39:16 +0000 (18:39 -0400)
committer: Kent Overstreet <kent.overstreet@gmail.com>
Mon, 2 May 2022 22:42:06 +0000 (18:42 -0400)
55 files changed:
.bcachefs_revision
cmd_debug.c
cmd_format.c
cmd_fs.c
cmd_migrate.c
doc/bcachefs-principles-of-operation.tex
include/trace/events/bcachefs.h
libbcachefs.c
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/counters.c [new file with mode: 0644]
libbcachefs/counters.h [new file with mode: 0644]
libbcachefs/darray.h
libbcachefs/debug.c
libbcachefs/extents.c
libbcachefs/fs-io.c
libbcachefs/fsck.c
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_sb.c
libbcachefs/lru.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/sysfs.h
libbcachefs/varint.c
tools-util.c
tools-util.h

index 20dd0bb791918d471e2ca2b729f22a56319c7b54..4fb144ee88879c2acdb04f62cd2980538d1d5204 100644 (file)
@@ -1 +1 @@
-4c2d3669b15475674b750244bb1e096849352bc8
+bdf6d7c1350497bc7b0be6027a51d9330645672d
index e29ceff6826978d65dfd28227208e795eca84e89..c2206dacdb5697ef168f365d92f92bde04869fac 100644 (file)
@@ -108,7 +108,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 
        qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
                          max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
-       darray_exit(data);
+       darray_exit(&data);
 }
 
 int cmd_dump(int argc, char *argv[])
index c3debe0a95f15ea5638d78179f7117d5a92244e8..5c2bc8c01a5245b8df81868002c7ea9351b7e8a8 100644 (file)
@@ -197,9 +197,9 @@ int cmd_format(int argc, char *argv[])
                        initialize = false;
                        break;
                case O_no_opt:
-                       darray_push(device_paths, optarg);
+                       darray_push(&device_paths, optarg);
                        dev_opts.path = optarg;
-                       darray_push(devices, dev_opts);
+                       darray_push(&devices, dev_opts);
                        dev_opts.size = 0;
                        break;
                case O_quiet:
@@ -253,7 +253,7 @@ int cmd_format(int argc, char *argv[])
                free(opts.passphrase);
        }
 
-       darray_exit(devices);
+       darray_exit(&devices);
 
        if (initialize) {
                struct bch_opts mount_opts = bch2_opts_empty();
@@ -275,7 +275,7 @@ int cmd_format(int argc, char *argv[])
                bch2_fs_stop(c);
        }
 
-       darray_exit(device_paths);
+       darray_exit(&device_paths);
 
        return 0;
 }
index 4e955ea24c601ca697d239b4dd9ecce0df1713dd..ee3ea650dbc0953ff2a4488bede580c2264211b5 100644 (file)
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -267,7 +267,7 @@ static void fs_usage_to_text(struct printbuf *out, const char *path)
                free(dev->dev);
                free(dev->label);
        }
-       darray_exit(dev_names);
+       darray_exit(&dev_names);
 
        bcache_fs_close(fs);
 }
index 08ec7de28004df6eca81c97b7fd933c9b502a434..b67fc02df994bdbaaead9db6901c64ed43265de7 100644 (file)
@@ -603,7 +603,7 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
 
        update_inode(c, &root_inode);
 
-       darray_exit(s.extents);
+       darray_exit(&s.extents);
        genradix_free(&s.hardlinks);
 }
 
index d5ac6ede3c7c420ac044448cbec5f675539e3273..89b39bf4733638b13c349d7f0f8789523dd7d068 100644 (file)
@@ -530,6 +530,8 @@ passed as mount parameters the persistent options are unmodified.
 
 \subsection{File and directory options}
 
+<say something here about how attrs must be set via bcachefs attr command>
+
 Options set on inodes (files and directories) are automatically inherited by
 their descendants, and inodes also record whether a given option was explicitly
 set or inherited from their parent. When renaming a directory would cause
index 1ae5e8885b465992e76bce575b1615c4c3719ece..b96b25741b68a01d94c5994157cbb70c36139634 100644 (file)
@@ -142,17 +142,21 @@ DEFINE_EVENT(bio, journal_write,
 );
 
 TRACE_EVENT(journal_reclaim_start,
-       TP_PROTO(struct bch_fs *c, u64 min_nr,
+       TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
+                u64 min_nr, u64 min_key_cache,
                 u64 prereserved, u64 prereserved_total,
                 u64 btree_cache_dirty, u64 btree_cache_total,
                 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, min_nr, prereserved, prereserved_total,
+       TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
                btree_cache_dirty, btree_cache_total,
                btree_key_cache_dirty, btree_key_cache_total),
 
        TP_STRUCT__entry(
                __field(dev_t,          dev                     )
+               __field(bool,           direct                  )
+               __field(bool,           kicked                  )
                __field(u64,            min_nr                  )
+               __field(u64,            min_key_cache           )
                __field(u64,            prereserved             )
                __field(u64,            prereserved_total       )
                __field(u64,            btree_cache_dirty       )
@@ -163,7 +167,10 @@ TRACE_EVENT(journal_reclaim_start,
 
        TP_fast_assign(
                __entry->dev                    = c->dev;
+               __entry->direct                 = direct;
+               __entry->kicked                 = kicked;
                __entry->min_nr                 = min_nr;
+               __entry->min_key_cache          = min_key_cache;
                __entry->prereserved            = prereserved;
                __entry->prereserved_total      = prereserved_total;
                __entry->btree_cache_dirty      = btree_cache_dirty;
@@ -172,9 +179,12 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->direct,
+                 __entry->kicked,
                  __entry->min_nr,
+                 __entry->min_key_cache,
                  __entry->prereserved,
                  __entry->prereserved_total,
                  __entry->btree_cache_dirty,
@@ -197,45 +207,13 @@ TRACE_EVENT(journal_reclaim_finish,
                __entry->nr_flushed     = nr_flushed;
        ),
 
-       TP_printk("%d%d flushed %llu",
+       TP_printk("%d,%d flushed %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_flushed)
 );
 
 /* allocator: */
 
-TRACE_EVENT(do_discards,
-       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
-                u64 need_journal_commit, u64 discarded, int ret),
-       TP_ARGS(c, seen, open, need_journal_commit, discarded, ret),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            seen                    )
-               __field(u64,            open                    )
-               __field(u64,            need_journal_commit     )
-               __field(u64,            discarded               )
-               __field(int,            ret                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->seen                   = seen;
-               __entry->open                   = open;
-               __entry->need_journal_commit    = need_journal_commit;
-               __entry->discarded              = discarded;
-               __entry->ret                    = ret;
-       ),
-
-       TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->seen,
-                 __entry->open,
-                 __entry->need_journal_commit,
-                 __entry->discarded,
-                 __entry->ret)
-);
-
 /* bset.c: */
 
 DEFINE_EVENT(bpos, bkey_pack_pos_fail,
@@ -367,6 +345,11 @@ DEFINE_EVENT(btree_node, btree_merge,
        TP_ARGS(c, b)
 );
 
+DEFINE_EVENT(btree_node, btree_rewrite,
+       TP_PROTO(struct bch_fs *c, struct btree *b),
+       TP_ARGS(c, b)
+);
+
 DEFINE_EVENT(btree_node, btree_set_root,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
@@ -440,79 +423,18 @@ TRACE_EVENT(btree_node_relock_fail,
 
 /* Garbage collection */
 
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
-       TP_PROTO(struct bch_fs *c, struct btree *b),
-       TP_ARGS(c, b)
-);
-
-DEFINE_EVENT(bch_fs, gc_start,
+DEFINE_EVENT(bch_fs, gc_gens_start,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(bch_fs, gc_end,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
-);
-
-DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
+DEFINE_EVENT(bch_fs, gc_gens_end,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
 /* Allocator */
 
-TRACE_EVENT(alloc_scan,
-       TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
-       TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev             )
-               __field(u64,            found           )
-               __field(u64,            inc_gen         )
-               __field(u64,            inc_gen_skipped )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = ca->dev;
-               __entry->found          = found;
-               __entry->inc_gen        = inc_gen;
-               __entry->inc_gen_skipped = inc_gen_skipped;
-       ),
-
-       TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
-);
-
-TRACE_EVENT(invalidate,
-       TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
-       TP_ARGS(ca, offset, sectors),
-
-       TP_STRUCT__entry(
-               __field(unsigned,       sectors                 )
-               __field(dev_t,          dev                     )
-               __field(__u64,          offset                  )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = ca->dev;
-               __entry->offset         = offset,
-               __entry->sectors        = sectors;
-       ),
-
-       TP_printk("invalidated %u sectors at %d,%d sector=%llu",
-                 __entry->sectors,
-                 MAJOR(__entry->dev),
-                 MINOR(__entry->dev),
-                 __entry->offset)
-);
-
 TRACE_EVENT(bucket_alloc,
        TP_PROTO(struct bch_dev *ca, const char *alloc_reserve),
        TP_ARGS(ca, alloc_reserve),
@@ -579,6 +501,59 @@ TRACE_EVENT(bucket_alloc_fail,
                  __entry->ret)
 );
 
+TRACE_EVENT(discard_buckets,
+       TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+                u64 need_journal_commit, u64 discarded, int ret),
+       TP_ARGS(c, seen, open, need_journal_commit, discarded, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(u64,            seen                    )
+               __field(u64,            open                    )
+               __field(u64,            need_journal_commit     )
+               __field(u64,            discarded               )
+               __field(int,            ret                     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev                    = c->dev;
+               __entry->seen                   = seen;
+               __entry->open                   = open;
+               __entry->need_journal_commit    = need_journal_commit;
+               __entry->discarded              = discarded;
+               __entry->ret                    = ret;
+       ),
+
+       TP_printk("%d,%d seen %llu open %llu need_journal_commit %llu discarded %llu ret %i",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->seen,
+                 __entry->open,
+                 __entry->need_journal_commit,
+                 __entry->discarded,
+                 __entry->ret)
+);
+
+TRACE_EVENT(invalidate_bucket,
+       TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket),
+       TP_ARGS(c, dev, bucket),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(u32,            dev_idx                 )
+               __field(u64,            bucket                  )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               __entry->dev_idx        = dev;
+               __entry->bucket         = bucket;
+       ),
+
+       TP_printk("%d:%d invalidated %u:%llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dev_idx, __entry->bucket)
+);
+
 /* Moving IO */
 
 DEFINE_EVENT(bkey, move_extent,
@@ -586,7 +561,7 @@ DEFINE_EVENT(bkey, move_extent,
        TP_ARGS(k)
 );
 
-DEFINE_EVENT(bkey, move_alloc_fail,
+DEFINE_EVENT(bkey, move_alloc_mem_fail,
        TP_PROTO(const struct bkey *k),
        TP_ARGS(k)
 );
@@ -670,7 +645,7 @@ TRACE_EVENT(copygc_wait,
                  __entry->wait_amount, __entry->until)
 );
 
-DECLARE_EVENT_CLASS(transaction_restart,
+DECLARE_EVENT_CLASS(transaction_event,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip),
@@ -688,55 +663,61 @@ DECLARE_EVENT_CLASS(transaction_restart,
        TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      transaction_restart_ip,
+DEFINE_EVENT(transaction_event,        transaction_commit,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip),
+       TP_ARGS(trans_fn, caller_ip)
+);
+
+DEFINE_EVENT(transaction_event,        transaction_restart_ip,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_blocked_journal_reclaim,
+DEFINE_EVENT(transaction_event,        trans_blocked_journal_reclaim,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
+DEFINE_EVENT(transaction_event,        trans_restart_journal_res_get,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_preres_get,
+DEFINE_EVENT(transaction_event,        trans_restart_journal_preres_get,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
+DEFINE_EVENT(transaction_event,        trans_restart_journal_reclaim,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_fault_inject,
+DEFINE_EVENT(transaction_event,        trans_restart_fault_inject,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_traverse_all,
+DEFINE_EVENT(transaction_event,        trans_traverse_all,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
+DEFINE_EVENT(transaction_event,        trans_restart_mark_replicas,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
 );
 
-DEFINE_EVENT(transaction_restart,      trans_restart_key_cache_raced,
+DEFINE_EVENT(transaction_event,        trans_restart_key_cache_raced,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
        TP_ARGS(trans_fn, caller_ip)
index 8ba01947fa7e2842352dd6adab2afea8f35dec19..4fe2c3db401a41afb2f79319986bb4819e31d310 100644 (file)
@@ -689,7 +689,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs)
        struct dirent *d;
        dev_names devs;
 
-       darray_init(devs);
+       darray_init(&devs);
 
        while ((errno = 0), (d = readdir(dir))) {
                struct dev_name n = { 0, NULL, NULL };
@@ -713,7 +713,7 @@ dev_names bchu_fs_get_devices(struct bchfs_handle fs)
                n.label = read_file_str(fs.sysfs_fd, label_attr);
                free(label_attr);
 
-               darray_push(devs, n);
+               darray_push(&devs, n);
        }
 
        closedir(dir);
index 7be4829790b621c47a67297a85ede2a400947110..32ebf6cc9e074a8ecc682e2fde616282846e64a4 100644 (file)
@@ -382,7 +382,8 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                return -EINVAL;
                        }
 
-                       if (!a.v->io_time[READ]) {
+                       if (!a.v->io_time[READ] &&
+                           test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) {
                                pr_buf(err, "cached bucket with read_time == 0");
                                return -EINVAL;
                        }
@@ -540,6 +541,7 @@ err:
 }
 
 int bch2_trans_mark_alloc(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
                          struct bkey_s_c old, struct bkey_i *new,
                          unsigned flags)
 {
@@ -587,7 +589,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
            !new_a->io_time[READ])
                new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-
        old_lru = alloc_lru_idx(old_a);
        new_lru = alloc_lru_idx(*new_a);
 
@@ -1065,7 +1066,7 @@ static void bch2_do_discards_work(struct work_struct *work)
 
        percpu_ref_put(&c->writes);
 
-       trace_do_discards(c, seen, open, need_journal_commit, discarded, ret);
+       trace_discard_buckets(c, seen, open, need_journal_commit, discarded, ret);
 }
 
 void bch2_do_discards(struct bch_fs *c)
@@ -1087,6 +1088,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
 
        bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
                             POS(ca->dev_idx, 0), 0);
+next_lru:
        k = bch2_btree_iter_peek(&lru_iter);
        ret = bkey_err(k);
        if (ret)
@@ -1095,9 +1097,20 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
        if (!k.k || k.k->p.inode != ca->dev_idx)
                goto out;
 
-       if (bch2_trans_inconsistent_on(k.k->type != KEY_TYPE_lru, trans,
-                                      "non lru key in lru btree"))
-               goto out;
+       if (k.k->type != KEY_TYPE_lru) {
+               pr_buf(&buf, "non lru key in lru btree:\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+                       bch2_btree_iter_advance(&lru_iter);
+                       goto next_lru;
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
 
        idx     = k.k->p.offset;
        bucket  = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
@@ -1110,13 +1123,19 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
 
        if (idx != alloc_lru_idx(a->v)) {
                pr_buf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n  ");
-
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));
                pr_buf(&buf, "\n  ");
                bch2_bkey_val_to_text(&buf, c, k);
-               bch2_trans_inconsistent(trans, "%s", buf.buf);
-               ret = -EINVAL;
-               goto out;
+
+               if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) {
+                       bch_err(c, "%s", buf.buf);
+                       bch2_btree_iter_advance(&lru_iter);
+                       goto next_lru;
+               } else {
+                       bch2_trans_inconsistent(trans, "%s", buf.buf);
+                       ret = -EINVAL;
+                       goto out;
+               }
        }
 
        SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
@@ -1129,6 +1148,10 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
 
        ret = bch2_trans_update(trans, &alloc_iter, &a->k_i,
                                BTREE_TRIGGER_BUCKET_INVALIDATE);
+       if (ret)
+               goto out;
+
+       trace_invalidate_bucket(c, a->k.p.inode, a->k.p.offset);
 out:
        bch2_trans_iter_exit(trans, &alloc_iter);
        bch2_trans_iter_exit(trans, &lru_iter);
index 2bc622b305c258d9f4ffd48f86e324ef2b73b8dc..ff366e61ace51ea4af61b916c88ca88f24d05a1e 100644 (file)
@@ -125,8 +125,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
 
 int bch2_alloc_read(struct bch_fs *);
 
-int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
-                         struct bkey_i *, unsigned);
+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
+                         struct bkey_s_c, struct bkey_i *, unsigned);
 int bch2_check_alloc_info(struct bch_fs *);
 int bch2_check_alloc_to_lru_refs(struct bch_fs *);
 void bch2_do_discards(struct bch_fs *);
index c2af3600cf84d69d8cd8df063b97c100f06ed2f2..88ec8609e0cbe281a821a9a2efe1f2d072adabe0 100644 (file)
@@ -276,10 +276,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
                                            u64 *skipped_open,
                                            u64 *skipped_need_journal_commit,
                                            u64 *skipped_nouse,
+                                           struct bkey_s_c freespace_k,
                                            struct closure *cl)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        struct open_bucket *ob;
        struct bch_alloc_v4 a;
@@ -288,6 +289,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
        struct printbuf buf = PRINTBUF;
        int ret;
 
+       if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
+               pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
+                      "  freespace key ",
+                       ca->mi.first_bucket, ca->mi.nbuckets);
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
        bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
@@ -298,29 +309,26 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
 
        bch2_alloc_to_v4(k, &a);
 
-       if (bch2_fs_inconsistent_on(a.data_type != BCH_DATA_free, c,
-                       "non free bucket in freespace btree (state %s)\n"
-                       "  %s\n"
-                       "  at %llu (genbits %u)",
-                       bch2_data_types[a.data_type],
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
-                       free_entry, genbits)) {
+       if (genbits != (alloc_freespace_genbits(a) >> 56)) {
+               pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+                      "  freespace key ",
+                      genbits, alloc_freespace_genbits(a) >> 56);
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               pr_buf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
                ob = ERR_PTR(-EIO);
                goto err;
-       }
 
-       if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c,
-                       "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
-                       "  %s",
-                       genbits, alloc_freespace_genbits(a) >> 56,
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               ob = ERR_PTR(-EIO);
-               goto err;
        }
 
-       if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c,
-                       "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)",
-                       b, ca->mi.first_bucket, ca->mi.nbuckets)) {
+       if (a.data_type != BCH_DATA_free) {
+               pr_buf(&buf, "non free bucket in freespace btree\n"
+                      "  freespace key ");
+               bch2_bkey_val_to_text(&buf, c, freespace_k);
+               pr_buf(&buf, "\n  ");
+               bch2_bkey_val_to_text(&buf, c, k);
+               bch2_trans_inconsistent(trans, "%s", buf.buf);
                ob = ERR_PTR(-EIO);
                goto err;
        }
@@ -446,13 +454,13 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
 
        BUG_ON(ca->new_fs_bucket_idx);
 
-       for_each_btree_key(trans, iter, BTREE_ID_freespace,
-                          POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
+                                    POS(ca->dev_idx, *cur_bucket), 0, k, ret) {
                if (k.k->p.inode != ca->dev_idx)
                        break;
 
                for (*cur_bucket = max(*cur_bucket, bkey_start_offset(k.k));
-                    *cur_bucket != k.k->p.offset && !ob;
+                    *cur_bucket < k.k->p.offset && !ob;
                     (*cur_bucket)++) {
                        if (btree_trans_too_many_iters(trans)) {
                                ob = ERR_PTR(-EINTR);
@@ -466,7 +474,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                              skipped_open,
                                              skipped_need_journal_commit,
                                              skipped_nouse,
-                                             cl);
+                                             k, cl);
                }
                if (ob)
                        break;
index e29a0891134bca2f9538c489d59562a9e2a7fa7e..2eced20667fa585d7299bd7341b1467034b8f3d7 100644 (file)
@@ -494,11 +494,6 @@ struct bch_dev {
 
 enum {
        /* startup: */
-       BCH_FS_ALLOC_CLEAN,
-       BCH_FS_INITIAL_GC_DONE,
-       BCH_FS_INITIAL_GC_UNFIXED,
-       BCH_FS_TOPOLOGY_REPAIR_DONE,
-       BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
        BCH_FS_MAY_GO_RW,
        BCH_FS_RW,
@@ -508,17 +503,22 @@ enum {
        BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
+       BCH_FS_CLEAN_SHUTDOWN,
+
+       /* fsck passes: */
+       BCH_FS_TOPOLOGY_REPAIR_DONE,
+       BCH_FS_INITIAL_GC_DONE,         /* kill when we enumerate fsck passes */
+       BCH_FS_CHECK_LRUS_DONE,
+       BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE,
+       BCH_FS_FSCK_DONE,
+       BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
+       BCH_FS_NEED_ANOTHER_GC,
 
        /* errors: */
        BCH_FS_ERROR,
        BCH_FS_TOPOLOGY_ERROR,
        BCH_FS_ERRORS_FIXED,
        BCH_FS_ERRORS_NOT_FIXED,
-
-       /* misc: */
-       BCH_FS_NEED_ANOTHER_GC,
-       BCH_FS_DELETED_NODES,
-       BCH_FS_REBUILD_REPLICAS,
 };
 
 struct btree_debug {
@@ -585,6 +585,7 @@ struct bch_fs {
 
        struct list_head        list;
        struct kobject          kobj;
+       struct kobject          counters_kobj;
        struct kobject          internal;
        struct kobject          opts_dir;
        struct kobject          time_stats;
@@ -901,12 +902,15 @@ struct bch_fs {
 
        u64                     last_bucket_seq_cleanup;
 
-       /* The rest of this all shows up in sysfs */
+       /* TODO rewrite as counters - The rest of this all shows up in sysfs */
        atomic_long_t           read_realloc_races;
        atomic_long_t           extent_migrate_done;
        atomic_long_t           extent_migrate_raced;
        atomic_long_t           bucket_alloc_fail;
 
+       u64                     counters_on_mount[BCH_COUNTER_NR];
+       u64 __percpu            *counters;
+
        unsigned                btree_gc_periodic:1;
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
index cc279abfe0af933767f8598186587bb6018e1423..1bea79cf4f6aa35efdd318bf376e994b441bd3a9 100644 (file)
@@ -1086,7 +1086,8 @@ struct bch_sb_field {
        x(clean,        6)                      \
        x(replicas,     7)                      \
        x(journal_seq_blacklist, 8)             \
-       x(journal_v2,   9)
+       x(journal_v2,   9)                      \
+       x(counters,     10)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -1319,6 +1320,25 @@ struct bch_sb_field_disk_groups {
        struct bch_disk_group   entries[0];
 } __attribute__((packed, aligned(8)));
 
+/* BCH_SB_FIELD_counters */
+
+#define BCH_PERSISTENT_COUNTERS()      \
+       x(io_read,  0)                  \
+       x(io_write, 1)                  \
+       x(io_move,  2)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+       struct bch_sb_field     field;
+       __le64                  d[0];
+};
+
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
  * the superblock:
index 488917752e0b1514fdb0ebf0eca45d9c30fa8034..db894b40d2ca4180e1e91f398cc3c7021fc68491 100644 (file)
@@ -27,8 +27,8 @@ struct bkey_ops {
        void            (*swab)(struct bkey_s);
        bool            (*key_normalize)(struct bch_fs *, struct bkey_s);
        bool            (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-       int             (*trans_trigger)(struct btree_trans *, struct bkey_s_c,
-                                        struct bkey_i *, unsigned);
+       int             (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
+                                        struct bkey_s_c, struct bkey_i *, unsigned);
        int             (*atomic_trigger)(struct btree_trans *, struct bkey_s_c,
                                          struct bkey_s_c, unsigned);
        void            (*compat)(enum btree_id id, unsigned version,
@@ -80,16 +80,80 @@ static inline int bch2_mark_key(struct btree_trans *trans,
                : 0;
 }
 
-static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
-                       struct bkey_i *new, unsigned flags)
+enum btree_update_flags {
+       __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
+
+       __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
+
+       __BTREE_TRIGGER_INSERT,
+       __BTREE_TRIGGER_OVERWRITE,
+
+       __BTREE_TRIGGER_GC,
+       __BTREE_TRIGGER_BUCKET_INVALIDATE,
+       __BTREE_TRIGGER_NOATOMIC,
+};
+
+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+
+#define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
+
+#define BTREE_TRIGGER_INSERT           (1U << __BTREE_TRIGGER_INSERT)
+#define BTREE_TRIGGER_OVERWRITE                (1U << __BTREE_TRIGGER_OVERWRITE)
+
+#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
+#define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
+#define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
+
+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
+       ((1U << KEY_TYPE_alloc)|                \
+        (1U << KEY_TYPE_alloc_v2)|             \
+        (1U << KEY_TYPE_alloc_v3)|             \
+        (1U << KEY_TYPE_alloc_v4)|             \
+        (1U << KEY_TYPE_stripe)|               \
+        (1U << KEY_TYPE_inode)|                \
+        (1U << KEY_TYPE_inode_v2)|             \
+        (1U << KEY_TYPE_snapshot))
+
+static inline int bch2_trans_mark_key(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_s_c old, struct bkey_i *new,
+                                     unsigned flags)
 {
        const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type];
 
        return ops->trans_trigger
-               ? ops->trans_trigger(trans, old, new, flags)
+               ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
                : 0;
 }
 
+static inline int bch2_trans_mark_old(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_s_c old, unsigned flags)
+{
+       struct bkey_i deleted;
+
+       bkey_init(&deleted.k);
+       deleted.k.p = old.k->p;
+
+       return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
+                                  BTREE_TRIGGER_OVERWRITE|flags);
+}
+
+static inline int bch2_trans_mark_new(struct btree_trans *trans,
+                                     enum btree_id btree_id, unsigned level,
+                                     struct bkey_i *new, unsigned flags)
+{
+       struct bkey_i deleted;
+
+       bkey_init(&deleted.k);
+       deleted.k.p = new->k.p;
+
+       return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+                                  BTREE_TRIGGER_INSERT|flags);
+}
+
 void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
 
 void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
index d027526d51f4fc57047c6654f5666cf24afb36dd..389b5a7bd7a27b989744d5e04a4fb63b7edabdea 100644 (file)
@@ -1745,18 +1745,14 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       u64 start_time = local_clock();
        unsigned iter = 0;
        int ret;
 
        lockdep_assert_held(&c->state_lock);
-       trace_gc_start(c);
 
        down_write(&c->gc_lock);
 
-       /* flush interior btree updates: */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        ret   = bch2_gc_start(c, metadata_only) ?:
                bch2_gc_alloc_start(c, metadata_only) ?:
@@ -1845,9 +1841,6 @@ out:
 
        up_write(&c->gc_lock);
 
-       trace_gc_end(c);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-
        /*
         * At startup, allocations can happen directly instead of via the
         * allocator thread - issue wakeup in case they blocked on gc_lock:
@@ -1984,6 +1977,7 @@ int bch2_gc_gens(struct bch_fs *c)
        if (!mutex_trylock(&c->gc_gens_lock))
                return 0;
 
+       trace_gc_gens_start(c);
        down_read(&c->gc_lock);
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -2035,6 +2029,7 @@ int bch2_gc_gens(struct bch_fs *c)
        c->gc_count++;
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+       trace_gc_gens_end(c);
 err:
        for_each_member_device(ca, c, i) {
                kvfree(ca->oldest_gen);
index f28325443b0d5fe95e15ef0c6abd147ec334e72c..7a883922bb0914b4deff3abab20c79efadb79fe0 100644 (file)
@@ -820,10 +820,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                printbuf_reset(&buf);
                if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
                        printbuf_reset(&buf);
-                       pr_buf(&buf, "invalid bkey:\n  ");
-                       bch2_bkey_val_to_text(&buf, c, u.s_c);
-                       pr_buf(&buf, "  \n");
+                       pr_buf(&buf, "invalid bkey:  ");
                        bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
+                       pr_buf(&buf, "\n  ");
+                       bch2_bkey_val_to_text(&buf, c, u.s_c);
 
                        btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
 
@@ -1081,10 +1081,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                     !bversion_cmp(u.k->version, MAX_VERSION))) {
                        printbuf_reset(&buf);
 
-                       pr_buf(&buf, "invalid bkey\n  ");
-                       bch2_bkey_val_to_text(&buf, c, u.s_c);
-                       pr_buf(&buf, "\n  ");
+                       pr_buf(&buf, "invalid bkey: ");
                        bch2_bkey_val_invalid(c, u.s_c, READ, &buf);
+                       pr_buf(&buf, "\n  ");
+                       bch2_bkey_val_to_text(&buf, c, u.s_c);
 
                        btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
 
@@ -2102,29 +2102,33 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        }
 }
 
-static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
 {
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct btree *b;
        unsigned i;
+       bool ret = false;
 restart:
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos)
                if (test_bit(flag, &b->flags)) {
                        rcu_read_unlock();
                        wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+                       ret = true;
                        goto restart;
                }
        rcu_read_unlock();
+
+       return ret;
 }
 
-void bch2_btree_flush_all_reads(struct bch_fs *c)
+bool bch2_btree_flush_all_reads(struct bch_fs *c)
 {
-       __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+       return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
 }
 
-void bch2_btree_flush_all_writes(struct bch_fs *c)
+bool bch2_btree_flush_all_writes(struct bch_fs *c)
 {
-       __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+       return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
 }
index d818d87661e863a78b19b047276094e9a84696a1..8af853642123df33276aad4cf1bad547001e7e6a 100644 (file)
@@ -152,8 +152,8 @@ static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
        bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
 }
 
-void bch2_btree_flush_all_reads(struct bch_fs *);
-void bch2_btree_flush_all_writes(struct bch_fs *);
+bool bch2_btree_flush_all_reads(struct bch_fs *);
+bool bch2_btree_flush_all_writes(struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
                                  unsigned version, unsigned big_endian,
index 7f7bf13df6961767568e3d0c06c4de31a7c13c4f..e7541af9518f7b91d72380c2fdb89dca9aa1f1d7 100644 (file)
@@ -1527,6 +1527,30 @@ static inline bool btree_path_good_node(struct btree_trans *trans,
        return true;
 }
 
+static void btree_path_set_level_up(struct btree_path *path)
+{
+       btree_node_unlock(path, path->level);
+       path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+       path->level++;
+       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+}
+
+static void btree_path_set_level_down(struct btree_trans *trans,
+                                     struct btree_path *path,
+                                     unsigned new_level)
+{
+       unsigned l;
+
+       path->level = new_level;
+
+       for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+               if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+                       btree_node_unlock(path, l);
+
+       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+       bch2_btree_path_verify(trans, path);
+}
+
 static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
                                                     struct btree_path *path,
                                                     int check_pos)
@@ -2100,7 +2124,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        struct btree_trans *trans = iter->trans;
        struct btree_path *path = iter->path;
        struct btree *b = NULL;
-       unsigned l;
        int ret;
 
        BUG_ON(trans->restarted);
@@ -2113,10 +2136,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 
        /* got to end? */
        if (!btree_path_node(path, path->level + 1)) {
-               btree_node_unlock(path, path->level);
-               path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
-               path->level++;
-               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               btree_path_set_level_up(path);
                return NULL;
        }
 
@@ -2148,14 +2168,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
                                           iter->flags & BTREE_ITER_INTENT,
                                           btree_iter_ip_allocated(iter));
 
-               path->level = iter->min_depth;
-
-               for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
-                       if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
-                               btree_node_unlock(path, l);
-
-               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-               bch2_btree_iter_verify(iter);
+               btree_path_set_level_down(trans, path, iter->min_depth);
 
                ret = bch2_btree_path_traverse(trans, path, iter->flags);
                if (ret)
@@ -2186,15 +2199,23 @@ err:
 
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
-       struct bpos pos = iter->k.p;
-       bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
-                   ? bpos_cmp(pos, SPOS_MAX)
-                   : bkey_cmp(pos, SPOS_MAX)) != 0;
+       if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
+               struct bpos pos = iter->k.p;
+               bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                           ? bpos_cmp(pos, SPOS_MAX)
+                           : bkey_cmp(pos, SPOS_MAX)) != 0;
 
-       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_successor(iter, pos);
-       bch2_btree_iter_set_pos(iter, pos);
-       return ret;
+               if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+                       pos = bkey_successor(iter, pos);
+               bch2_btree_iter_set_pos(iter, pos);
+               return ret;
+       } else {
+               if (!btree_path_node(iter->path, iter->path->level))
+                       return true;
+
+               iter->advanced = true;
+               return false;
+       }
 }
 
 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -2377,6 +2398,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
        struct bpos iter_pos;
        int ret;
 
+       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+
        if (iter->update_path) {
                bch2_path_put(trans, iter->update_path,
                              iter->flags & BTREE_ITER_INTENT);
@@ -2494,6 +2517,100 @@ out:
        return k;
 }
 
+/**
+ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
+ * to iterator's current position, returning keys from every level of the btree.
+ * For keys at different levels of the btree that compare equal, the key from
+ * the lower level (leaf) is returned first.
+ */
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bkey_s_c k;
+       int ret;
+
+       EBUG_ON(iter->path->cached);
+       bch2_btree_iter_verify(iter);
+       BUG_ON(iter->path->level < iter->min_depth);
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+       EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
+
+       while (1) {
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+               if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
+                       k = bkey_s_c_err(ret);
+                       goto out;
+               }
+
+               /* Already at end? */
+               if (!btree_path_node(iter->path, iter->path->level)) {
+                       k = bkey_s_c_null;
+                       goto out;
+               }
+
+               k = btree_path_level_peek_all(trans->c,
+                               &iter->path->l[iter->path->level], &iter->k);
+
+               /* Check if we should go up to the parent node: */
+               if (!k.k ||
+                   (iter->advanced &&
+                    !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) {
+                       iter->pos = path_l(iter->path)->b->key.k.p;
+                       btree_path_set_level_up(iter->path);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               /*
+                * Check if we should go back down to a leaf:
+                * If we're not in a leaf node, we only return the current key
+                * if it exactly matches iter->pos - otherwise we first have to
+                * go back to the leaf:
+                */
+               if (iter->path->level != iter->min_depth &&
+                   (iter->advanced ||
+                    !k.k ||
+                    bpos_cmp(iter->pos, k.k->p))) {
+                       btree_path_set_level_down(trans, iter->path, iter->min_depth);
+                       iter->pos = bpos_successor(iter->pos);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               /* Check if we should go to the next key: */
+               if (iter->path->level == iter->min_depth &&
+                   iter->advanced &&
+                   k.k &&
+                   !bpos_cmp(iter->pos, k.k->p)) {
+                       iter->pos = bpos_successor(iter->pos);
+                       iter->advanced = false;
+                       continue;
+               }
+
+               if (iter->advanced &&
+                   iter->path->level == iter->min_depth &&
+                   bpos_cmp(k.k->p, iter->pos))
+                       iter->advanced = false;
+
+               BUG_ON(iter->advanced);
+               BUG_ON(!k.k);
+               break;
+       }
+
+       iter->pos = k.k->p;
+out:
+       iter->path->should_be_locked = true;
+       bch2_btree_iter_verify(iter);
+
+       return k;
+}
+
 /**
  * bch2_btree_iter_next: returns first key greater than iterator's current
  * position
@@ -2650,9 +2767,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(iter->path->level);
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
+       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
+       EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
 
        /* extents can't span inode numbers: */
        if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2687,7 +2805,9 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
                    (next_update = bch2_journal_keys_peek_slot(trans->c,
-                                       iter->btree_id, 0, iter->pos))) {
+                                       iter->btree_id,
+                                       iter->path->level,
+                                       iter->pos))) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
                        goto out;
@@ -2704,6 +2824,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        } else {
                struct bpos next;
 
+               EBUG_ON(iter->path->level);
+
                if (iter->flags & BTREE_ITER_INTENT) {
                        struct btree_iter iter2;
                        struct bpos end = iter->pos;
@@ -2802,6 +2924,9 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
        struct btree_path *path, *prev = NULL;
        unsigned i;
 
+       if (!bch2_debug_check_iterators)
+               return;
+
        trans_for_each_path_inorder(trans, path, i) {
                if (prev && btree_path_cmp(prev, path) > 0) {
                        bch2_dump_trans_paths_updates(trans);
@@ -2919,6 +3044,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
 {
        EBUG_ON(trans->restarted);
 
+       if (flags & BTREE_ITER_ALL_LEVELS)
+               flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
+
        if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
            btree_node_type_is_extents(btree_id))
                flags |= BTREE_ITER_IS_EXTENTS;
@@ -2934,12 +3062,6 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
        if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
                flags |= BTREE_ITER_WITH_JOURNAL;
 
-       if (!btree_id_cached(trans->c, btree_id)) {
-               flags &= ~BTREE_ITER_CACHED;
-               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
-       } else if (!(flags & BTREE_ITER_CACHED))
-               flags |= BTREE_ITER_WITH_KEY_CACHE;
-
        iter->trans     = trans;
        iter->path      = NULL;
        iter->update_path = NULL;
@@ -2965,6 +3087,12 @@ void bch2_trans_iter_init(struct btree_trans *trans,
                          unsigned btree_id, struct bpos pos,
                          unsigned flags)
 {
+       if (!btree_id_cached(trans->c, btree_id)) {
+               flags &= ~BTREE_ITER_CACHED;
+               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+       } else if (!(flags & BTREE_ITER_CACHED))
+               flags |= BTREE_ITER_WITH_KEY_CACHE;
+
        __bch2_trans_iter_init(trans, iter, btree_id, pos,
                               0, 0, flags, _RET_IP_);
 }
index f6700295e1a7af5690b4987983c345236b7112c7..dad05ea003570402ea582e7887529e09ec8dc53b 100644 (file)
@@ -212,6 +212,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *);
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
 
+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
+
 static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 {
        return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
@@ -313,9 +315,9 @@ static inline int bkey_err(struct bkey_s_c k)
 static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
                                                        unsigned flags)
 {
-       return flags & BTREE_ITER_SLOTS
-               ? bch2_btree_iter_peek_slot(iter)
-               : bch2_btree_iter_peek(iter);
+       return  flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
+               flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+                                               bch2_btree_iter_peek(iter);
 }
 
 static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
index ab394c2d6ef21317845ad49c1f0c7223d5394f2e..a575189f358c9146308d3d93a9b7e2bf40222055 100644 (file)
@@ -236,6 +236,13 @@ static int btree_key_cache_fill(struct btree_trans *trans,
         */
        new_u64s = k.k->u64s + 1;
 
+       /*
+        * Allocate some extra space so that the transaction commit path is less
+        * likely to have to reallocate, since that requires a transaction
+        * restart:
+        */
+       new_u64s = min(256U, (new_u64s * 3) / 2);
+
        if (new_u64s > ck->u64s) {
                new_u64s = roundup_pow_of_two(new_u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
index 3438e089dba0edc81c5ec808fa688ce0d187588e..e4ed46a4f870847019fa9346a3b853673bd7c6d5 100644 (file)
@@ -182,22 +182,16 @@ struct btree_node_iter {
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
 #define BTREE_ITER_SLOTS               (1 << 0)
+#define BTREE_ITER_ALL_LEVELS          (1 << 1)
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-#define BTREE_ITER_INTENT              (1 << 1)
+#define BTREE_ITER_INTENT              (1 << 2)
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-#define BTREE_ITER_PREFETCH            (1 << 2)
-/*
- * Indicates that this iterator should not be reused until transaction commit,
- * either because a pending update references it or because the update depends
- * on that particular key being locked (e.g. by the str_hash code, for hash
- * table consistency)
- */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 3)
+#define BTREE_ITER_PREFETCH            (1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
@@ -282,7 +276,8 @@ struct btree_iter {
        struct btree_path       *key_cache_path;
 
        enum btree_id           btree_id:4;
-       unsigned                min_depth:4;
+       unsigned                min_depth:3;
+       unsigned                advanced:1;
 
        /* btree_iter_copy starts here: */
        u16                     flags;
@@ -639,42 +634,6 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
        return (1 << id) & BTREE_ID_HAS_SNAPSHOTS;
 }
 
-enum btree_update_flags {
-       __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
-       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
-
-       __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
-
-       __BTREE_TRIGGER_INSERT,
-       __BTREE_TRIGGER_OVERWRITE,
-
-       __BTREE_TRIGGER_GC,
-       __BTREE_TRIGGER_BUCKET_INVALIDATE,
-       __BTREE_TRIGGER_NOATOMIC,
-};
-
-#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
-#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
-
-#define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
-
-#define BTREE_TRIGGER_INSERT           (1U << __BTREE_TRIGGER_INSERT)
-#define BTREE_TRIGGER_OVERWRITE                (1U << __BTREE_TRIGGER_OVERWRITE)
-
-#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
-#define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
-
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_alloc)|                \
-        (1U << KEY_TYPE_alloc_v2)|             \
-        (1U << KEY_TYPE_alloc_v3)|             \
-        (1U << KEY_TYPE_alloc_v4)|             \
-        (1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode)|                \
-        (1U << KEY_TYPE_inode_v2)|             \
-        (1U << KEY_TYPE_snapshot))
-
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
        return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
index b1aa77b8f8b9990ff0514c99e2eb2738e7c8eca0..686e51dfb487df2f4e189a0ae6d8f78ecf081f5a 100644 (file)
@@ -381,16 +381,13 @@ static void bch2_btree_reserve_put(struct btree_update *as)
        struct bch_fs *c = as->c;
        struct prealloc_nodes *p;
 
-       mutex_lock(&c->btree_reserve_cache_lock);
-
        for (p = as->prealloc_nodes;
             p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
             p++) {
                while (p->nr) {
                        struct btree *b = p->b[--p->nr];
 
-                       six_lock_intent(&b->c.lock, NULL, NULL);
-                       six_lock_write(&b->c.lock, NULL, NULL);
+                       mutex_lock(&c->btree_reserve_cache_lock);
 
                        if (c->btree_reserve_cache_nr <
                            ARRAY_SIZE(c->btree_reserve_cache)) {
@@ -404,13 +401,15 @@ static void bch2_btree_reserve_put(struct btree_update *as)
                                bch2_open_buckets_put(c, &b->ob);
                        }
 
+                       mutex_unlock(&c->btree_reserve_cache_lock);
+
+                       six_lock_intent(&b->c.lock, NULL, NULL);
+                       six_lock_write(&b->c.lock, NULL, NULL);
                        __btree_node_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
                }
        }
-
-       mutex_unlock(&c->btree_reserve_cache_lock);
 }
 
 static int bch2_btree_reserve_get(struct btree_update *as,
@@ -506,20 +505,18 @@ static void bch2_btree_update_free(struct btree_update *as)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_update_will_delete_key(struct btree_update *as,
-                                        struct bkey_i *k)
+static void btree_update_add_key(struct btree_update *as,
+                                struct keylist *keys, struct btree *b)
 {
-       BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
+       struct bkey_i *k = &b->key;
+
+       BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
               ARRAY_SIZE(as->_old_keys));
-       bch2_keylist_add(&as->old_keys, k);
-}
 
-static void btree_update_will_add_key(struct btree_update *as,
-                                     struct bkey_i *k)
-{
-       BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
-              ARRAY_SIZE(as->_new_keys));
-       bch2_keylist_add(&as->new_keys, k);
+       bkey_copy(keys->top, k);
+       bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;
+
+       bch2_keylist_push(keys);
 }
 
 /*
@@ -532,7 +529,7 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
        struct bkey_i *k;
        int ret;
 
-       ret = darray_make_room(trans->extra_journal_entries, as->journal_u64s);
+       ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
        if (ret)
                return ret;
 
@@ -543,14 +540,18 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
 
        trans->journal_pin = &as->journal;
 
-       for_each_keylist_key(&as->new_keys, k) {
-               ret = bch2_trans_mark_new(trans, k, 0);
+       for_each_keylist_key(&as->old_keys, k) {
+               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+               ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
                if (ret)
                        return ret;
        }
 
-       for_each_keylist_key(&as->old_keys, k) {
-               ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(k), 0);
+       for_each_keylist_key(&as->new_keys, k) {
+               unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
+
+               ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
                if (ret)
                        return ret;
        }
@@ -653,8 +654,8 @@ err:
 
                        if (!ret) {
                                i->journal_seq = cpu_to_le64(
-                                       max(journal_seq,
-                                           le64_to_cpu(i->journal_seq)));
+                                                            max(journal_seq,
+                                                                le64_to_cpu(i->journal_seq)));
 
                                bch2_btree_add_journal_pin(c, b, journal_seq);
                        } else {
@@ -822,7 +823,7 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree
 
        mutex_unlock(&c->btree_interior_update_lock);
 
-       btree_update_will_add_key(as, &b->key);
+       btree_update_add_key(as, &as->new_keys, b);
 }
 
 /*
@@ -875,7 +876,7 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
  * btree_updates to point to this btree_update:
  */
 static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
-                                              struct btree *b)
+                                                     struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree_update *p, *n;
@@ -939,7 +940,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         */
        btree_update_drop_new_node(c, b);
 
-       btree_update_will_delete_key(as, &b->key);
+       btree_update_add_key(as, &as->old_keys, b);
 
        as->old_nodes[as->nr_old_nodes] = b;
        as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
@@ -1095,11 +1096,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        list_del_init(&b->list);
        mutex_unlock(&c->btree_cache.lock);
 
-       if (b->c.level)
-               six_lock_pcpu_alloc(&b->c.lock);
-       else
-               six_lock_pcpu_free(&b->c.lock);
-
        mutex_lock(&c->btree_root_lock);
        BUG_ON(btree_node_root(c, b) &&
               (b->c.level < btree_node_root(c, b)->c.level ||
@@ -1249,13 +1245,14 @@ static struct btree *__btree_split_node(struct btree_update *as,
        struct bpos n1_pos;
 
        n2 = bch2_btree_node_alloc(as, n1->c.level);
-       bch2_btree_update_add_new_node(as, n2);
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
        SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
        n2->key.k.p = n1->key.k.p;
 
+       bch2_btree_update_add_new_node(as, n2);
+
        set1 = btree_bset_first(n1);
        set2 = btree_bset_first(n2);
 
@@ -1412,7 +1409,6 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
        bch2_btree_interior_update_will_free_node(as, b);
 
        n1 = bch2_btree_node_alloc_replacement(as, b);
-       bch2_btree_update_add_new_node(as, n1);
 
        if (keys)
                btree_split_insert_keys(as, trans, path, n1, keys);
@@ -1427,6 +1423,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                six_unlock_write(&n2->c.lock);
                six_unlock_write(&n1->c.lock);
 
+               bch2_btree_update_add_new_node(as, n1);
+
                bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
                bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
 
@@ -1455,6 +1453,8 @@ static void btree_split(struct btree_update *as, struct btree_trans *trans,
                bch2_btree_build_aux_trees(n1);
                six_unlock_write(&n1->c.lock);
 
+               bch2_btree_update_add_new_node(as, n1);
+
                bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
 
                if (parent)
@@ -1723,7 +1723,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        bch2_btree_interior_update_will_free_node(as, m);
 
        n = bch2_btree_node_alloc(as, b->c.level);
-       bch2_btree_update_add_new_node(as, n);
 
        SET_BTREE_NODE_SEQ(n->data,
                           max(BTREE_NODE_SEQ(b->data),
@@ -1731,8 +1730,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        btree_set_min(n, prev->data->min_key);
        btree_set_max(n, next->data->max_key);
-       n->data->format         = new_f;
 
+       bch2_btree_update_add_new_node(as, n);
+
+       n->data->format  = new_f;
        btree_node_set_format(n, new_f);
 
        bch2_btree_sort_into(c, n, prev);
@@ -1797,10 +1798,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
        as = bch2_btree_update_start(trans, iter->path, b->c.level,
                                     false, flags);
        ret = PTR_ERR_OR_ZERO(as);
-       if (ret) {
-               trace_btree_gc_rewrite_node_fail(c, b);
+       if (ret)
                goto out;
-       }
 
        bch2_btree_interior_update_will_free_node(as, b);
 
@@ -1810,7 +1809,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
        bch2_btree_build_aux_trees(n);
        six_unlock_write(&n->c.lock);
 
-       trace_btree_gc_rewrite_node(c, b);
+       trace_btree_rewrite(c, b);
 
        bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
@@ -1915,11 +1914,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        int ret;
 
        if (!skip_triggers) {
-               ret = bch2_trans_mark_new(trans, new_key, 0);
+               ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
+                                         bkey_i_to_s_c(&b->key), 0);
                if (ret)
                        return ret;
 
-               ret = bch2_trans_mark_old(trans, bkey_i_to_s_c(&b->key), 0);
+               ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
+                                         new_key, 0);
                if (ret)
                        return ret;
        }
@@ -1956,7 +1957,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        } else {
                BUG_ON(btree_node_root(c, b) != b);
 
-               ret = darray_make_room(trans->extra_journal_entries,
+               ret = darray_make_room(&trans->extra_journal_entries,
                                       jset_u64s(new_key->k.u64s));
                if (ret)
                        return ret;
@@ -2158,19 +2159,27 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
 {
-       size_t ret = 0;
-       struct list_head *i;
+       bool ret;
 
        mutex_lock(&c->btree_interior_update_lock);
-       list_for_each(i, &c->btree_interior_update_list)
-               ret++;
+       ret = !list_empty(&c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
 
        return ret;
 }
 
+bool bch2_btree_interior_updates_flush(struct bch_fs *c)
+{
+       bool ret = bch2_btree_interior_updates_pending(c);
+
+       if (ret)
+               closure_wait_event(&c->btree_interior_update_wait,
+                                  !bch2_btree_interior_updates_pending(c));
+       return ret;
+}
+
 void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
 {
        struct btree_root *r;
index e72eb87956167d881bf3997d136ee0fef4d87d14..adfc6c24a7a402f3eeb88db33394909213d08fc2 100644 (file)
@@ -309,7 +309,7 @@ static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
 
 void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
 
-size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+bool bch2_btree_interior_updates_flush(struct bch_fs *);
 
 void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
 struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
index ef90f7a3d8dc2e3f3c0fc0b2749fbe62ffad1e1c..58bb687a3a8fe8c5f892b3f2ca598efcf7040092 100644 (file)
@@ -478,16 +478,16 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
                i->overwrite_trigger_run = true;
                i->insert_trigger_run = true;
-               return bch2_trans_mark_key(trans, old, i->k,
+               return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
                                           BTREE_TRIGGER_INSERT|
                                           BTREE_TRIGGER_OVERWRITE|
                                           i->flags) ?: 1;
        } else if (overwrite && !i->overwrite_trigger_run) {
                i->overwrite_trigger_run = true;
-               return bch2_trans_mark_old(trans, old, i->flags) ?: 1;
+               return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
        } else if (!overwrite && !i->insert_trigger_run) {
                i->insert_trigger_run = true;
-               return bch2_trans_mark_new(trans, i->k, i->flags) ?: 1;
+               return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
        } else {
                return 0;
        }
@@ -1111,6 +1111,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
                        goto out_reset;
        }
 
+       EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
        memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
 
        trans->journal_u64s             = trans->extra_journal_entries.nr;
@@ -1159,6 +1161,8 @@ retry:
 
        if (ret)
                goto err;
+
+       trace_transaction_commit(trans->fn, _RET_IP_);
 out:
        bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
@@ -1753,7 +1757,7 @@ int bch2_trans_log_msg(struct btree_trans *trans, const char *msg)
        struct jset_entry_log *l;
        int ret;
 
-       ret = darray_make_room(trans->extra_journal_entries, jset_u64s(u64s));
+       ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s));
        if (ret)
                return ret;
 
index 8202bf12ac24a944ac7cb2994588f18cf3b18d20..c5c904d9c29b77d40010c723b16e1d41fe6ee3d1 100644 (file)
@@ -378,10 +378,9 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
 
        idx = bch2_replicas_entry_idx(c, r);
        if (idx < 0 &&
-           (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-            fsck_err(c, "no replicas entry\n"
-                     "  while marking %s",
-                     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+           fsck_err(c, "no replicas entry\n"
+                    "  while marking %s",
+                    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                percpu_up_read(&c->mark_lock);
                ret = bch2_mark_replicas(c, r);
                percpu_down_read(&c->mark_lock);
@@ -596,9 +595,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
                        bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
                        return ret;
                }
-
-               trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
-                                old_a.cached_sectors);
        }
 
        return 0;
@@ -1447,6 +1443,7 @@ err:
 }
 
 int bch2_trans_mark_extent(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
                           struct bkey_s_c old, struct bkey_i *new,
                           unsigned flags)
 {
@@ -1585,6 +1582,7 @@ err:
 }
 
 int bch2_trans_mark_stripe(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
                           struct bkey_s_c old, struct bkey_i *new,
                           unsigned flags)
 {
@@ -1655,6 +1653,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
 }
 
 int bch2_trans_mark_inode(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
                          struct bkey_s_c old,
                          struct bkey_i *new,
                          unsigned flags)
@@ -1671,6 +1670,7 @@ int bch2_trans_mark_inode(struct btree_trans *trans,
 }
 
 int bch2_trans_mark_reservation(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
                                struct bkey_s_c old,
                                struct bkey_i *new,
                                unsigned flags)
@@ -1772,6 +1772,7 @@ err:
 }
 
 int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old,
                              struct bkey_i *new,
                              unsigned flags)
index 8f360b37927dc5d804ee934f1a43886e6c455072..3469327d6c9d73724df8b6a8c85abc3a363a1e42 100644 (file)
@@ -202,41 +202,14 @@ int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsi
 int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
 int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
 
-int bch2_trans_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 
 int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
 
-int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
-                       struct bkey_i *, unsigned);
-
-static inline int bch2_trans_mark_old(struct btree_trans *trans,
-                                     struct bkey_s_c old, unsigned flags)
-{
-       struct bkey_i deleted;
-
-       bkey_init(&deleted.k);
-       deleted.k.p = old.k->p;
-
-       return bch2_trans_mark_key(trans, old, &deleted,
-                                  BTREE_TRIGGER_OVERWRITE|flags);
-}
-
-static inline int bch2_trans_mark_new(struct btree_trans *trans,
-                                     struct bkey_i *new, unsigned flags)
-{
-       struct bkey_i deleted;
-
-       bkey_init(&deleted.k);
-       deleted.k.p = new->k.p;
-
-       return bch2_trans_mark_key(trans, bkey_i_to_s_c(&deleted), new,
-                                  BTREE_TRIGGER_INSERT|flags);
-}
-
 int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c
new file mode 100644 (file)
index 0000000..25a6b38
--- /dev/null
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "super-io.h"
+#include "counters.h"
+
+/* BCH_SB_FIELD_counters */
+
+const char * const bch2_counter_names[] = {
+#define x(t, n, ...) (#t),
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       NULL
+};
+
+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
+{
+       if (!ctrs)
+               return 0;
+
+       return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
+};
+
+static int bch2_sb_counters_validate(struct bch_sb *sb,
+                                    struct bch_sb_field *f,
+                                    struct printbuf *err)
+{
+       return 0;
+};
+
+void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
+                             struct bch_sb_field *f)
+{
+       struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+       for (i = 0; i < nr; i++) {
+               if (i < BCH_COUNTER_NR)
+                       pr_buf(out, "%s", bch2_counter_names[i]);
+               else
+                       pr_buf(out, "(unknown)");
+
+               pr_tab(out);
+               pr_buf(out, "%llu", le64_to_cpu(ctrs->d[i]));
+               pr_newline(out);
+       };
+};
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+       u64 val = 0;
+
+       for (i = 0; i < BCH_COUNTER_NR; i++)
+               c->counters_on_mount[i] = 0;
+
+       for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
+               val = le64_to_cpu(ctrs->d[i]);
+               percpu_u64_set(&c->counters[i], val);
+               c->counters_on_mount[i] = val;
+       }
+       return 0;
+};
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       struct bch_sb_field_counters *ret;
+       unsigned int i;
+       unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
+
+       if (nr < BCH_COUNTER_NR) {
+               ret = bch2_sb_resize_counters(&c->disk_sb,
+                                              sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
+
+               if (ret) {
+                       ctrs = ret;
+                       nr = bch2_sb_counter_nr_entries(ctrs);
+               }
+       }
+
+
+       for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
+               ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+       return 0;
+}
+
+int bch2_fs_counters_init(struct bch_fs *c)
+{
+       int ret = 0;
+
+       c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64));
+
+       if (!c->counters)
+               return -ENOMEM;
+
+       ret = bch2_sb_counters_to_cpu(c);
+
+       return ret;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_counters = {
+       .validate       = bch2_sb_counters_validate,
+       .to_text        = bch2_sb_counters_to_text,
+};
diff --git a/libbcachefs/counters.h b/libbcachefs/counters.h
new file mode 100644 (file)
index 0000000..1f3207a
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COUNTERS_H
+#define _BCACHEFS_COUNTERS_H
+
+#include "bcachefs.h"
+#include "super-io.h"
+
+
+int bch2_sb_counters_to_cpu(struct bch_fs *c);
+
+int bch2_sb_counters_from_cpu(struct bch_fs *c);
+
+int bch2_fs_counters_init(struct bch_fs *c);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+
+#endif // _BCACHEFS_COUNTERS_H
index 745b1cdb0d1711ecff934d03e1274728ceeb8fa2..049e1d1e13427dd2811f842bbd9a3a376fd8de25 100644 (file)
@@ -36,7 +36,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
 }
 
 #define darray_make_room(_d, _more)                                    \
-       __darray_make_room((darray_void *) &(_d), sizeof((_d).data[0]), (_more))
+       __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more))
 
 #define darray_top(_d)         ((_d).data[(_d).nr])
 
@@ -45,7 +45,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
        int _ret = darray_make_room((_d), 1);                           \
                                                                        \
        if (!_ret)                                                      \
-               (_d).data[(_d).nr++] = (_item);                         \
+               (_d)->data[(_d)->nr++] = (_item);                       \
        _ret;                                                           \
 })
 
@@ -54,7 +54,7 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
        int _ret = darray_make_room((_d), 1);                           \
                                                                        \
        if (!_ret)                                                      \
-               array_insert_item((_d).data, (_d).nr, (_pos), (_item)); \
+               array_insert_item((_d)->data, (_d)->nr, (_pos), (_item));\
        _ret;                                                           \
 })
 
@@ -63,13 +63,13 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more)
 
 #define darray_init(_d)                                                        \
 do {                                                                   \
-       (_d).data = NULL;                                               \
-       (_d).nr = (_d).size = 0;                                        \
+       (_d)->data = NULL;                                              \
+       (_d)->nr = (_d)->size = 0;                                      \
 } while (0)
 
 #define darray_exit(_d)                                                        \
 do {                                                                   \
-       kfree((_d).data);                                               \
+       kfree((_d)->data);                                              \
        darray_init(_d);                                                \
 } while (0)
 
index 2d65ae370931cb3ac5fe0641a7dc5ab67d405155..3b869be1850680ccb0b60e34e524b09b0c6d026f 100644 (file)
@@ -443,6 +443,11 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
        bch2_flags_to_text(out, bch2_btree_node_flags, b->flags);
        pr_newline(out);
 
+       pr_buf(out, "pcpu read locks: ");
+       pr_tab(out);
+       pr_buf(out, "%u", b->c.lock.readers != NULL);
+       pr_newline(out);
+
        pr_buf(out, "written:");
        pr_tab(out);
        pr_buf(out, "%u", b->written);
index dffbcffa923d471c9eae96b632ebfcf09d10caf4..2e541a4f55ac73d098ea74c90776d47b726d0c94 100644 (file)
@@ -308,8 +308,20 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                            lp.crc.uncompressed_size +
                            rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
                                return false;
+               }
+
+               en_l = extent_entry_next(en_l);
+               en_r = extent_entry_next(en_r);
+       }
+
+       en_l = l_ptrs.start;
+       en_r = r_ptrs.start;
+       while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+               if (extent_entry_is_crc(en_l)) {
+                       struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+                       struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
 
-                       if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+                       if (crc_l.uncompressed_size + crc_r.uncompressed_size >
                            bch2_crc_field_size_max[extent_entry_type(en_l)])
                                return false;
                }
index 05a0246722a9b18098e146a8d41cf15614f65a56..4cb2b2eb4dbe541da8d53bbf8c190241e7d0c978 100644 (file)
@@ -232,7 +232,10 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
                return;
 
        mutex_lock(&inode->ei_quota_lock);
-       BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+                               inode->ei_inode.bi_sectors);
        inode->v.i_blocks += sectors;
 
 #ifdef CONFIG_BCACHEFS_QUOTA
@@ -2710,9 +2713,11 @@ int bch2_truncate(struct user_namespace *mnt_userns,
                        U64_MAX, &i_sectors_delta);
        i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
-               !bch2_journal_error(&c->journal));
-
+       bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
+                               !bch2_journal_error(&c->journal), c,
+                               "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks,
+                               inode->ei_inode.bi_sectors);
        if (unlikely(ret))
                goto err;
 
index 963834d074d94d26395b9e218f296c3dbd7c59ac..f1abec95a740cb03e0b9facd37512677206d4e23 100644 (file)
@@ -560,7 +560,7 @@ struct inode_walker {
 
 static void inode_walker_exit(struct inode_walker *w)
 {
-       darray_exit(w->inodes);
+       darray_exit(&w->inodes);
 }
 
 static struct inode_walker inode_walker_init(void)
@@ -575,7 +575,7 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w,
 
        BUG_ON(bch2_inode_unpack(inode, &u));
 
-       return darray_push(w->inodes, ((struct inode_walker_entry) {
+       return darray_push(&w->inodes, ((struct inode_walker_entry) {
                .inode          = u,
                .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
        }));
@@ -628,7 +628,7 @@ found:
                while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
                        --i;
 
-               ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]);
+               ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]);
                if (ret)
                        return ret;
 
@@ -740,8 +740,9 @@ static int hash_check_key(struct btree_trans *trans,
        if (hash_k.k->p.offset < hash)
                goto bad_hash;
 
-       for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
-                          BTREE_ITER_SLOTS, k, ret) {
+       for_each_btree_key_norestart(trans, iter, desc.btree_id,
+                                    POS(hash_k.k->p.inode, hash),
+                                    BTREE_ITER_SLOTS, k, ret) {
                if (!bkey_cmp(k.k->p, hash_k.k->p))
                        break;
 
@@ -759,16 +760,15 @@ static int hash_check_key(struct btree_trans *trans,
                        bch2_trans_iter_exit(trans, &iter);
                        goto bad_hash;
                }
-
        }
 out:
        bch2_trans_iter_exit(trans, &iter);
        printbuf_exit(&buf);
        return ret;
 bad_hash:
-       if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
+       if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, "
                     "hashed to %llu\n%s",
-                    desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
+                    bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
                     (printbuf_reset(&buf),
                      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE)
                return 0;
@@ -1405,8 +1405,8 @@ static int check_dirent_target(struct btree_trans *trans,
 
                if (fsck_err_on(backpointer_exists &&
                                !target->bi_nlink, c,
-                               "inode %llu has multiple links but i_nlink 0",
-                               target->bi_inum)) {
+                               "inode %llu type %s has multiple links but i_nlink 0",
+                               target->bi_inum, bch2_d_types[d.v->d_type])) {
                        target->bi_nlink++;
                        target->bi_flags &= ~BCH_INODE_UNLINKED;
 
@@ -1879,7 +1879,7 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
 static int path_down(struct bch_fs *c, pathbuf *p,
                     u64 inum, u32 snapshot)
 {
-       int ret = darray_push(*p, ((struct pathbuf_entry) {
+       int ret = darray_push(p, ((struct pathbuf_entry) {
                .inum           = inum,
                .snapshot       = snapshot,
        }));
@@ -2037,7 +2037,7 @@ static int check_directory_structure(struct bch_fs *c)
 
        BUG_ON(ret == -EINTR);
 
-       darray_exit(path);
+       darray_exit(&path);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -2254,8 +2254,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                }
 
                if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
-                               "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
-                               u.bi_inum, mode_to_type(u.bi_mode),
+                               "inode %llu type %s has wrong i_nlink (%u, should be %u)",
+                               u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
                                bch2_inode_nlink_get(&u), link->count)) {
                        bch2_inode_nlink_set(&u, link->count);
 
index 223344e1ad74c303802fd1858c913ead1f8de038..1ad4c7d77812998a17d4fec011ce645176d49d81 100644 (file)
@@ -1288,6 +1288,7 @@ void bch2_write(struct closure *cl)
                goto err;
        }
 
+       this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
        bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
        data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -2200,6 +2201,7 @@ get_bio:
        if (rbio->bounce)
                trace_read_bounce(&rbio->bio);
 
+       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
        bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
        /*
index 75be0a5f708c089e496e123d7f67b4fd4879eff7..4c5b67599007a3bbaa252b62319ea81dd57b145a 100644 (file)
@@ -792,8 +792,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
        int ret = 0;
 
        if (c) {
-               bch2_journal_block(&c->journal);
                bch2_journal_flush_all_pins(&c->journal);
+               bch2_journal_block(&c->journal);
        }
 
        bu              = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL);
index 7fee0c05aa7d72069b49ff6a3872ec04d8ac33e8..59453dcfa4e937137404e7be0452a7e30114356c 100644 (file)
@@ -146,8 +146,6 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
        return j->seq_ondisk + 1;
 }
 
-void bch2_journal_set_has_inum(struct journal *, u64, u64);
-
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
        switch (idx) {
index 7c0aed9d92f3fe5bee35fbe065e89fb2d432d04d..e537a578c44316a1dd9cb7dc72b4604f5386cc7f 100644 (file)
@@ -1055,7 +1055,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
        jlist.ret = 0;
 
        for_each_member_device(ca, c, iter) {
-               if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+               if (!c->opts.fsck &&
                    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
                        continue;
 
@@ -1212,10 +1212,9 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
                bch2_replicas_entry_to_text(&buf, &replicas.e);
 
                if (!degraded &&
-                   (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
-                                "superblock not marked as containing replicas %s",
-                                buf.buf))) {
+                   fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+                               "superblock not marked as containing replicas %s",
+                               buf.buf)) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
@@ -1442,7 +1441,8 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       journal_reclaim_kick(&c->journal);
+       if (j->watermark)
+               journal_reclaim_kick(&c->journal);
 
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
index a9f7d5a7feb2a8811a406d42974c47305c479721..fdc94e831a86d63b6d0c07fcc3b0eee8e72fe288 100644 (file)
@@ -589,7 +589,7 @@ static u64 journal_seq_to_flush(struct journal *j)
  * 512 journal entries or 25% of all journal buckets, then
  * journal_next_bucket() should not stall.
  */
-static int __bch2_journal_reclaim(struct journal *j, bool direct)
+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        bool kthread = (current->flags & PF_KTHREAD) != 0;
@@ -638,8 +638,10 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
                        min_nr = 1;
 
-               trace_journal_reclaim_start(c,
-                               min_nr,
+               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
+
+               trace_journal_reclaim_start(c, direct, kicked,
+                               min_nr, min_key_cache,
                                j->prereserved.reserved,
                                j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
@@ -647,8 +649,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
                                atomic_long_read(&c->btree_key_cache.nr_keys));
 
-               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
-
                nr_flushed = journal_flush_pins(j, seq_to_flush,
                                                min_nr, min_key_cache);
 
@@ -669,7 +669,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
 
 int bch2_journal_reclaim(struct journal *j)
 {
-       return __bch2_journal_reclaim(j, true);
+       return __bch2_journal_reclaim(j, true, true);
 }
 
 static int bch2_journal_reclaim_thread(void *arg)
@@ -685,10 +685,12 @@ static int bch2_journal_reclaim_thread(void *arg)
        j->last_flushed = jiffies;
 
        while (!ret && !kthread_should_stop()) {
+               bool kicked = j->reclaim_kicked;
+
                j->reclaim_kicked = false;
 
                mutex_lock(&j->reclaim_lock);
-               ret = __bch2_journal_reclaim(j, false);
+               ret = __bch2_journal_reclaim(j, false, kicked);
                mutex_unlock(&j->reclaim_lock);
 
                now = jiffies;
index 506044e358db8b2e805ee11f3b9794243db13dbe..6d984313d4b58ce529be56b5d8f9c07f2ae7b93e 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "journal_sb.h"
+#include "darray.h"
 
 #include <linux/sort.h>
 
@@ -142,12 +143,6 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
        }
 
        for (i = 0; i + 1 < nr; i++) {
-               if (b[i].end == b[i + 1].start) {
-                       pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu",
-                              b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
-                       goto err;
-               }
-
                if (b[i].end > b[i + 1].start) {
                        pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
                               b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
@@ -219,5 +214,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
                }
        }
 
+       BUG_ON(dst + 1 != nr);
+
        return 0;
 }
index fe9d1574294794333ddbb52fa0c83a910f9fe3c0..ce23b38382f5270a94cb93f1ef01971fab4ffb92 100644 (file)
@@ -204,7 +204,9 @@ int bch2_check_lrus(struct bch_fs *c, bool initial)
 
        for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
                        bch2_check_lru_key(&trans, &iter, initial));
                if (ret)
                        break;
index 6defc33322b3b24bd5f9a076165accce6f204aa0..5345697f2712cebdf44a59022cb3d93be1edf2c5 100644 (file)
@@ -175,10 +175,7 @@ next:
                        goto err;
        }
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-
+       bch2_btree_interior_updates_flush(c);
        ret = 0;
 err:
        bch2_trans_exit(&trans);
index 1de213506adf738069a99b64b421fbc85b5b13b3..f1fb2ab513dfc3080ef6b9bb3da327ea58292812 100644 (file)
@@ -125,7 +125,7 @@ next:
                }
        }
        bch2_trans_iter_exit(trans, &iter);
-       darray_exit(s.ids);
+       darray_exit(&s.ids);
 
        return ret;
 }
@@ -574,6 +574,7 @@ static int bch2_move_extent(struct btree_trans *trans,
 
        atomic64_inc(&ctxt->stats->keys_moved);
        atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+       this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
 
        trace_move_extent(k.k);
 
@@ -596,7 +597,7 @@ err_free_pages:
 err_free:
        kfree(io);
 err:
-       trace_move_alloc_fail(k.k);
+       trace_move_alloc_mem_fail(k.k);
        return ret;
 }
 
@@ -941,9 +942,7 @@ next:
        if (ret)
                bch_err(c, "error %i in bch2_move_btree", ret);
 
-       /* flush relevant btree updates */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        progress_list_del(c, stats);
        return ret;
index 8bc67d07afb93c9f6aa1b35b11237769ae19d90d..85f029602eb5f41c61660b73b3f34f1643533fba 100644 (file)
@@ -316,11 +316,6 @@ enum opt_type {
          OPT_BOOL(),                                                   \
          BCH2_NO_SB_OPT,                       false,                          \
          NULL,         "Don't replay the journal")                     \
-       x(rebuild_replicas,             u8,                             \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,                       false,                          \
-         NULL,         "Rebuild the superblock replicas section")      \
        x(keep_journal,                 u8,                             \
          0,                                                            \
          OPT_BOOL(),                                                   \
index a9b4ad17705175ce171cefc9facd6f59fc1ef0bf..ff483ff303da964c97f6c793b98f69a41ab9ffce 100644 (file)
@@ -147,7 +147,7 @@ static void journal_iters_fix(struct bch_fs *c)
 
        /*
         * If an iterator points one after the key we just inserted,
-        * and the key we just inserted compares >= the iterator's position,
+        * and the key we just inserted compares > the iterator's position,
         * decrement the iterator so it points at the key we just inserted:
         */
        list_for_each_entry(iter, &c->journal_iters, journal.list)
@@ -155,7 +155,7 @@ static void journal_iters_fix(struct bch_fs *c)
                    iter->last &&
                    iter->b->c.btree_id == n->btree_id &&
                    iter->b->c.level    == n->level &&
-                   bpos_cmp(n->k->k.p, iter->unpacked.p) >= 0)
+                   bpos_cmp(n->k->k.p, iter->unpacked.p) > 0)
                        iter->journal.idx = keys->gap - 1;
 }
 
@@ -994,7 +994,6 @@ static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
        if (ret)
                return ret;
 
-
        bkey_subvolume_init(&root_volume.k_i);
        root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
        root_volume.v.flags     = 0;
@@ -1087,12 +1086,6 @@ int bch2_fs_recovery(struct bch_fs *c)
                c->opts.fix_errors = FSCK_OPT_YES;
        }
 
-       if (!c->replicas.entries ||
-           c->opts.rebuild_replicas) {
-               bch_info(c, "building replicas info");
-               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
-       }
-
        if (!c->opts.nochanges) {
                if (c->sb.version < bcachefs_metadata_version_new_data_types) {
                        bch_info(c, "version prior to new_data_types, upgrade and fsck required");
@@ -1102,6 +1095,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                }
        }
 
+       if (c->opts.fsck && c->opts.norecovery) {
+               bch_err(c, "cannot select both norecovery and fsck");
+               ret = -EINVAL;
+               goto err;
+       }
+
        ret = bch2_blacklist_table_initialize(c);
        if (ret) {
                bch_err(c, "error initializing blacklist table");
@@ -1195,6 +1194,13 @@ use_clean:
        if (ret)
                goto err;
 
+       /*
+        * Skip past versions that might have possibly been used (as nonces),
+        * but hadn't had their pointers written:
+        */
+       if (c->sb.encryption_type && !c->sb.clean)
+               atomic64_add(1 << 16, &c->key_version);
+
        ret = read_btree_roots(c);
        if (ret)
                goto err;
@@ -1217,17 +1223,9 @@ use_clean:
                goto err;
        bch_verbose(c, "stripes_read done");
 
-       /*
-        * If we're not running fsck, this ensures bch2_fsck_err() calls are
-        * instead interpreted as bch2_inconsistent_err() calls:
-        */
-       if (!c->opts.fsck)
-               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+       bch2_stripes_heap_start(c);
 
-       if (c->opts.fsck ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
-           test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
+       if (c->opts.fsck) {
                bool metadata_only = c->opts.norecovery;
 
                bch_info(c, "checking allocations");
@@ -1236,63 +1234,70 @@ use_clean:
                if (ret)
                        goto err;
                bch_verbose(c, "done checking allocations");
-       }
 
-       if (c->opts.fsck) {
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
                bch_info(c, "checking need_discard and freespace btrees");
                err = "error checking need_discard and freespace btrees";
                ret = bch2_check_alloc_info(c);
-               if (ret)
-                       goto err;
-
-               ret = bch2_check_lrus(c, true);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking need_discard and freespace btrees");
-       }
-
-       bch2_stripes_heap_start(c);
-
-       clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
 
-       /*
-        * Skip past versions that might have possibly been used (as nonces),
-        * but hadn't had their pointers written:
-        */
-       if (c->sb.encryption_type && !c->sb.clean)
-               atomic64_add(1 << 16, &c->key_version);
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
 
-       if (c->opts.norecovery)
-               goto out;
+               bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
 
-       bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
-       err = "journal replay failed";
-       ret = bch2_journal_replay(c);
-       if (ret)
-               goto err;
-       if (c->opts.verbose || !c->sb.clean)
-               bch_info(c, "journal replay done");
+               bch_info(c, "checking lrus");
+               err = "error checking lrus";
+               ret = bch2_check_lrus(c, true);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking lrus");
 
-       err = "error initializing freespace";
-       ret = bch2_fs_freespace_init(c);
-       if (ret)
-               goto err;
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
 
-       if (c->opts.fsck) {
                bch_info(c, "checking alloc to lru refs");
                err = "error checking alloc to lru refs";
                ret = bch2_check_alloc_to_lru_refs(c);
                if (ret)
                        goto err;
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
 
                ret = bch2_check_lrus(c, true);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking alloc to lru refs");
+       } else {
+               set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags);
+               set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags);
+               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
+               if (c->opts.norecovery)
+                       goto out;
+
+               bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
+               err = "journal replay failed";
+               ret = bch2_journal_replay(c);
+               if (ret)
+                       goto err;
+               if (c->opts.verbose || !c->sb.clean)
+                       bch_info(c, "journal replay done");
        }
 
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
+       if (ret)
+               goto err;
+
        if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
                bch2_fs_lazy_rw(c);
 
index 6a81eb9b41a021e1ac68e21a4977fa7fc5ac3fe4..a53a3d53c8da0d958ccdfd0e68144d55f0f74c3a 100644 (file)
@@ -110,6 +110,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 }
 
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
@@ -124,7 +125,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans,
                }
        }
 
-       return bch2_trans_mark_extent(trans, old, new, flags);
+       return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
 }
 
 /* indirect inline data */
@@ -153,6 +154,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
 }
 
 int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
index e0a9d8e4d1caabc11e04629b06ec62b9bb849d60..f9848dc3eebbaeb770048d6c375d0a829a0f0d64 100644 (file)
@@ -20,8 +20,8 @@ int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
                           int, struct printbuf *);
 void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, struct bkey_s_c,
-                             struct bkey_i *, unsigned);
+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+                             struct bkey_s_c, struct bkey_i *, unsigned);
 
 #define bch2_bkey_ops_reflink_v (struct bkey_ops) {            \
        .key_invalid    = bch2_reflink_v_invalid,               \
@@ -36,6 +36,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
 void bch2_indirect_inline_data_to_text(struct printbuf *,
                                struct bch_fs *, struct bkey_s_c);
 int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+                                        enum btree_id, unsigned,
                              struct bkey_s_c, struct bkey_i *,
                              unsigned);
 
index 63a57399cb7cefc51563910c48451313284f012d..81bdcb7795ae5e7599af087d6054eaab42cc6459 100644 (file)
@@ -565,7 +565,7 @@ static int snapshot_id_add(snapshot_id_list *s, u32 id)
 {
        BUG_ON(snapshot_list_has_id(s, id));
 
-       return darray_push(*s, id);
+       return darray_push(s, id);
 }
 
 static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
@@ -622,7 +622,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
        }
        bch2_trans_iter_exit(trans, &iter);
 
-       darray_exit(equiv_seen);
+       darray_exit(&equiv_seen);
 
        return ret;
 }
@@ -722,7 +722,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
                }
        }
 err:
-       darray_exit(deleted);
+       darray_exit(&deleted);
        bch2_trans_exit(&trans);
        percpu_ref_put(&c->writes);
 }
@@ -888,7 +888,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
        while (!ret) {
                mutex_lock(&c->snapshots_unlinked_lock);
                s = c->snapshots_unlinked;
-               darray_init(c->snapshots_unlinked);
+               darray_init(&c->snapshots_unlinked);
                mutex_unlock(&c->snapshots_unlinked_lock);
 
                if (!s.nr)
@@ -905,7 +905,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
                        }
                }
 
-               darray_exit(s);
+               darray_exit(&s);
        }
 
        percpu_ref_put(&c->writes);
index a4425389351524954bc306dec406778cd6c5a947..b1739d29c7d40af0a6082e42c4c5e0ea1cb567fb 100644 (file)
@@ -76,7 +76,7 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
 
 static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
 {
-       int ret = darray_push(s->ids, id);
+       int ret = darray_push(&s->ids, id);
        if (ret)
                bch_err(c, "error reallocating snapshots_seen table (size %zu)",
                        s->ids.size);
index 1aaae1400633a73db6bd044c34c96816c92f35c4..a2b789b4ac68a344dde31b82ffe1687373f0eecc 100644 (file)
@@ -17,6 +17,7 @@
 #include "super-io.h"
 #include "super.h"
 #include "vstructs.h"
+#include "counters.h"
 
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
@@ -818,6 +819,8 @@ int bch2_write_super(struct bch_fs *c)
 
        SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
 
+       bch2_sb_counters_from_cpu(c);
+
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
 
index 1401cb576fa2fbeb510b85107314491dbee58c5c..bdac2b7272334085204bf7fef5fb9e385abed9b1 100644 (file)
@@ -44,6 +44,7 @@
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
+#include "counters.h"
 
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
@@ -71,6 +72,9 @@ struct kobj_type type ## _ktype = {                                   \
 
 static void bch2_fs_release(struct kobject *);
 static void bch2_dev_release(struct kobject *);
+static void bch2_fs_counters_release(struct kobject *k)
+{
+}
 
 static void bch2_fs_internal_release(struct kobject *k)
 {
@@ -85,6 +89,7 @@ static void bch2_fs_time_stats_release(struct kobject *k)
 }
 
 static KTYPE(bch2_fs);
+static KTYPE(bch2_fs_counters);
 static KTYPE(bch2_fs_internal);
 static KTYPE(bch2_fs_opts_dir);
 static KTYPE(bch2_fs_time_stats);
@@ -188,57 +193,33 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i, clean_passes = 0;
+       u64 seq = 0;
 
        bch2_rebalance_stop(c);
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       /*
-        * Flush journal before stopping allocators, because flushing journal
-        * blacklist entries involves allocating new btree nodes:
-        */
-       bch2_journal_flush_all_pins(&c->journal);
-
        bch_verbose(c, "flushing journal and stopping allocators");
 
-       bch2_journal_flush_all_pins(&c->journal);
-
        do {
                clean_passes++;
 
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
-
-               /*
-                * In flight interior btree updates will generate more journal
-                * updates and btree updates (alloc btree):
-                */
-               if (bch2_btree_interior_updates_nr_pending(c)) {
-                       closure_wait_event(&c->btree_interior_update_wait,
-                                          !bch2_btree_interior_updates_nr_pending(c));
+               if (bch2_btree_interior_updates_flush(c) ||
+                   bch2_journal_flush_all_pins(&c->journal) ||
+                   bch2_btree_flush_all_writes(c) ||
+                   seq != atomic64_read(&c->journal.seq)) {
+                       seq = atomic64_read(&c->journal.seq);
                        clean_passes = 0;
                }
-               flush_work(&c->btree_interior_update_work);
-
-               if (bch2_journal_flush_all_pins(&c->journal))
-                       clean_passes = 0;
        } while (clean_passes < 2);
-       bch_verbose(c, "flushing journal and stopping allocators complete");
-
-       set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
-       flush_work(&c->btree_interior_update_work);
+       bch_verbose(c, "flushing journal and stopping allocators complete");
 
+       if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
+           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
        bch2_fs_journal_stop(&c->journal);
 
-       /*
-        * the journal kicks off btree writes via reclaim - wait for in flight
-        * writes after stopping journal:
-        */
-       bch2_btree_flush_all_writes(c);
-
        /*
         * After stopping journal:
         */
@@ -297,7 +278,7 @@ void bch2_fs_read_only(struct bch_fs *c)
            !test_bit(BCH_FS_ERROR, &c->flags) &&
            !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
            test_bit(BCH_FS_STARTED, &c->flags) &&
-           test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
+           test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
            !c->opts.norecovery) {
                bch_verbose(c, "marking filesystem clean");
                bch2_fs_mark_clean(c);
@@ -388,7 +369,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        if (ret)
                goto err;
 
-       clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
+       clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
 
        for_each_rw_member(ca, c, i)
                bch2_dev_allocator_add(c, ca);
@@ -517,6 +498,7 @@ void __bch2_fs_stop(struct bch_fs *c)
        bch2_fs_debug_exit(c);
        bch2_fs_chardev_exit(c);
 
+       kobject_put(&c->counters_kobj);
        kobject_put(&c->time_stats);
        kobject_put(&c->opts_dir);
        kobject_put(&c->internal);
@@ -585,6 +567,7 @@ static int bch2_fs_online(struct bch_fs *c)
            kobject_add(&c->internal, &c->kobj, "internal") ?:
            kobject_add(&c->opts_dir, &c->kobj, "options") ?:
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+           kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
            bch2_opts_create_sysfs_files(&c->opts_dir);
        if (ret) {
                bch_err(c, "error creating sysfs objects");
@@ -633,6 +616,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        kobject_init(&c->internal, &bch2_fs_internal_ktype);
        kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
        kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+       kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);
 
        c->minor                = -1;
        c->disk_sb.fs_sb        = true;
@@ -796,7 +780,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_encryption_init(c) ?:
            bch2_fs_compress_init(c) ?:
            bch2_fs_ec_init(c) ?:
-           bch2_fs_fsio_init(c);
+           bch2_fs_fsio_init(c) ?:
+           bch2_fs_counters_init(c);
        if (ret)
                goto err;
 
index d3919fa4a4553e49267dff37f6e036721ba938cd..77e2ec73319eae6254b53f517b9e7718aa9ec858 100644 (file)
@@ -40,7 +40,7 @@
 #include "util.h"
 
 #define SYSFS_OPS(type)                                                        \
-struct sysfs_ops type ## _sysfs_ops = {                                        \
+const struct sysfs_ops type ## _sysfs_ops = {                                  \
        .show   = type ## _show,                                        \
        .store  = type ## _store                                        \
 }
@@ -55,6 +55,9 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
        struct printbuf out = PRINTBUF;                                 \
        ssize_t ret = fn ## _to_text(&out, kobj, attr);                 \
                                                                        \
+       if (out.pos && out.buf[out.pos - 1] != '\n')                    \
+               pr_newline(&out);                                       \
+                                                                       \
        if (!ret && out.allocation_failure)                             \
                ret = -ENOMEM;                                          \
                                                                        \
@@ -191,6 +194,10 @@ read_attribute(extent_migrate_done);
 read_attribute(extent_migrate_raced);
 read_attribute(bucket_alloc_fail);
 
+#define x(t, n, ...) read_attribute(t);
+BCH_PERSISTENT_COUNTERS()
+#undef x
+
 rw_attribute(discard);
 rw_attribute(label);
 
@@ -544,6 +551,47 @@ struct attribute *bch2_fs_files[] = {
        NULL
 };
 
+/* counters dir */
+
+SHOW(bch2_fs_counters)
+{
+       struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj);
+       u64 counter = 0;
+       u64 counter_since_mount = 0;
+
+       out->tabstops[0] = 32;
+       #define x(t, ...) \
+               if (attr == &sysfs_##t) {                                       \
+                       counter             = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\
+                       counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\
+                       pr_buf(out, "since mount:");                            \
+                       pr_tab(out);                                            \
+                       bch2_hprint(out, counter_since_mount << 9);             \
+                       pr_newline(out);                                        \
+                                                                               \
+                       pr_buf(out, "since filesystem creation:");              \
+                       pr_tab(out);                                            \
+                       bch2_hprint(out, counter << 9);                         \
+                       pr_newline(out);                                        \
+               }
+       BCH_PERSISTENT_COUNTERS()
+       #undef x
+       return 0;
+}
+
+STORE(bch2_fs_counters) {
+       return 0;
+}
+
+SYSFS_OPS(bch2_fs_counters);
+
+struct attribute *bch2_fs_counters_files[] = {
+#define x(t, ...) \
+       &sysfs_##t,
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       NULL
+};
 /* internal dir - just a wrapper */
 
 SHOW(bch2_fs_internal)
@@ -614,7 +662,7 @@ STORE(bch2_fs_opts_dir)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
        const struct bch_option *opt = container_of(attr, struct bch_option, attr);
-       int ret = size, id = opt - bch2_opt_table;
+       int ret, id = opt - bch2_opt_table;
        char *tmp;
        u64 v;
 
@@ -649,6 +697,8 @@ STORE(bch2_fs_opts_dir)
                bch2_rebalance_add_work(c, S64_MAX);
                rebalance_wakeup(c);
        }
+
+       ret = size;
 err:
        percpu_ref_put(&c->writes);
        return ret;
index 525fd05d91f7d003519e17a82e876e83157db9b3..222cd5062702cdd6a54335dcd058b95320867f8f 100644 (file)
@@ -10,28 +10,32 @@ struct attribute;
 struct sysfs_ops;
 
 extern struct attribute *bch2_fs_files[];
+extern struct attribute *bch2_fs_counters_files[];
 extern struct attribute *bch2_fs_internal_files[];
 extern struct attribute *bch2_fs_opts_dir_files[];
 extern struct attribute *bch2_fs_time_stats_files[];
 extern struct attribute *bch2_dev_files[];
 
-extern struct sysfs_ops bch2_fs_sysfs_ops;
-extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
-extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
-extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
-extern struct sysfs_ops bch2_dev_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
+extern const struct sysfs_ops bch2_dev_sysfs_ops;
 
 int bch2_opts_create_sysfs_files(struct kobject *);
 
 #else
 
 static struct attribute *bch2_fs_files[] = {};
+static struct attribute *bch2_fs_counters_files[] = {};
 static struct attribute *bch2_fs_internal_files[] = {};
 static struct attribute *bch2_fs_opts_dir_files[] = {};
 static struct attribute *bch2_fs_time_stats_files[] = {};
 static struct attribute *bch2_dev_files[] = {};
 
 static const struct sysfs_ops bch2_fs_sysfs_ops;
+static const struct sysfs_ops bch2_fs_counters_sysfs_ops;
 static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
 static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
 static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
index a2d6bb7136c7d412d95469ac09b2c063fd0dd86d..5143b603bf67ff397181e70a9705cfa100237fbf 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/bitops.h>
+#include <linux/math.h>
 #include <linux/string.h>
 #include <asm/unaligned.h>
 
index e338152185b88a8d8702af5d2fa81c111441b968..591e2a0c45b8cb92e21f55a0b12848932523e4a1 100644 (file)
@@ -306,10 +306,10 @@ void ranges_sort_merge(ranges *r)
                if (t && t->end >= i->start)
                        t->end = max(t->end, i->end);
                else
-                       darray_push(tmp, *i);
+                       darray_push(&tmp, *i);
        }
 
-       darray_exit(*r);
+       darray_exit(r);
        *r = tmp;
 }
 
index a0e20ebf6eb970eb73f82c649dd7f0a66dbb437d..136d7d6585e3d125b27fb6b76f1c160d9715bc42 100644 (file)
@@ -76,7 +76,7 @@ typedef DARRAY(struct range) ranges;
 
 static inline void range_add(ranges *data, u64 offset, u64 size)
 {
-       darray_push(*data, ((struct range) {
+       darray_push(data, ((struct range) {
                .start = offset,
                .end = offset + size
        }));