git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 481b5f343248 bcachefs: Better error messages for missing...
author Kent Overstreet <kent.overstreet@linux.dev>
Wed, 17 Jan 2024 03:45:53 +0000 (22:45 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Wed, 24 Jan 2024 22:36:11 +0000 (17:36 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
92 files changed:
.bcachefs_revision
c_src/cmd_dump.c
c_src/cmd_format.c
c_src/cmd_fs.c
c_src/cmd_migrate.c
c_src/libbcachefs.c
c_src/libbcachefs.h
c_src/tools-util.c
include/linux/blkdev.h
include/linux/poison.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background_format.h [new file with mode: 0644]
libbcachefs/alloc_foreground.c
libbcachefs/backpointers.c
libbcachefs/backpointers.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/btree_trans_commit.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_write_buffer.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/clock.c
libbcachefs/compress.h
libbcachefs/data_update.c
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent_format.h [new file with mode: 0644]
libbcachefs/ec.c
libbcachefs/ec_format.h [new file with mode: 0644]
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/extents_format.h [new file with mode: 0644]
libbcachefs/eytzinger.h
libbcachefs/fs-io-direct.c
libbcachefs/fs-io-pagecache.c
libbcachefs/fs-io-pagecache.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode_format.h [new file with mode: 0644]
libbcachefs/io_misc.c
libbcachefs/io_write.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/logged_ops_format.h [new file with mode: 0644]
libbcachefs/move.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/quota_format.h [new file with mode: 0644]
libbcachefs/rebalance.c
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/reflink_format.h [new file with mode: 0644]
libbcachefs/replicas.c
libbcachefs/sb-clean.c
libbcachefs/sb-counters.c [moved from libbcachefs/counters.c with 99% similarity]
libbcachefs/sb-counters.h [moved from libbcachefs/counters.h with 77% similarity]
libbcachefs/sb-counters_format.h [new file with mode: 0644]
libbcachefs/sb-members.c
libbcachefs/snapshot.c
libbcachefs/snapshot_format.h [new file with mode: 0644]
libbcachefs/subvolume_format.h [new file with mode: 0644]
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/trace.h
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/xattr.c
libbcachefs/xattr_format.h [new file with mode: 0644]
linux/blkdev.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 236a97d5640e9308930514ccc2b8a49fc83d86e7..797e13368b786a0eb6037554f2e68b80d2f68c9e 100644
@@ -1 +1 @@
-cbb2e45634dd3b6ae38e45856ab6215c687a8806
+481b5f34324809f47a58ed798d038fb17e5b7b0a
diff --git a/c_src/cmd_dump.c b/c_src/cmd_dump.c
index 51cc876b5558a48270ce4d34606504e928c11278..0ffaf98d0ad605d067a743f6579a0fcee4683a76 100644
@@ -10,6 +10,7 @@
 
 #include "libbcachefs/bcachefs.h"
 #include "libbcachefs/btree_cache.h"
+#include "libbcachefs/btree_io.h"
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/error.h"
 #include "libbcachefs/extents.h"
@@ -76,7 +77,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
                                        if (ptr->dev == ca->dev_idx)
                                                range_add(&data,
                                                          ptr->offset << 9,
-                                                         btree_bytes(c));
+                                                         btree_ptr_sectors_written(&b->key));
                        }
                }
 
@@ -91,7 +92,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
                                if (ptr->dev == ca->dev_idx)
                                        range_add(&data,
                                                  ptr->offset << 9,
-                                                 btree_bytes(c));
+                                                 btree_ptr_sectors_written(&b->key));
                }
 
                bch2_trans_iter_exit(trans, &iter);
@@ -99,7 +100,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
        }
 
        qcow2_write_image(ca->disk_sb.bdev->bd_fd, fd, &data,
-                         max_t(unsigned, btree_bytes(c) / 8, block_bytes(c)));
+                         max_t(unsigned, c->opts.btree_node_size / 8, block_bytes(c)));
        darray_exit(&data);
 }
 
diff --git a/c_src/cmd_format.c b/c_src/cmd_format.c
index 45c44e3229d0f5ff50cf5b21d3bd6d1042267f1c..6b77763e2ff921a8f4d57d8992fb57c281f741f5 100644
@@ -188,7 +188,7 @@ int cmd_format(int argc, char *argv[])
                case O_data_allowed:
                        dev_opts.data_allowed =
                                read_flag_list_or_die(optarg,
-                                       bch2_data_types, "data type");
+                                       __bch2_data_types, "data type");
                        unconsumed_dev_option = true;
                        break;
                case O_durability:
diff --git a/c_src/cmd_fs.c b/c_src/cmd_fs.c
index 67c38af6eafe73f9d9ae8d622d7ab350007513ca..1a5d144b6d8c3e4927a1aa572e097d160cf8aa8b 100644
@@ -5,8 +5,10 @@
 #include <uuid/uuid.h>
 
 #include "linux/sort.h"
+#include "linux/rcupdate.h"
 
 #include "libbcachefs/bcachefs_ioctl.h"
+#include "libbcachefs/buckets.h"
 #include "libbcachefs/darray.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super-io.h"
 #include "libbcachefs.h"
 
 static void __dev_usage_type_to_text(struct printbuf *out,
-                                    const char *type,
+                                    enum bch_data_type type,
                                     unsigned bucket_size,
                                     u64 buckets, u64 sectors, u64 frag)
 {
-       prt_printf(out, "%s:", type);
+       bch2_prt_data_type(out, type);
+       prt_char(out, ':');
        prt_tab(out);
 
        prt_units_u64(out, sectors << 9);
@@ -51,7 +54,7 @@ static void dev_usage_type_to_text(struct printbuf *out,
                sectors = u->d[type].sectors;
        }
 
-       __dev_usage_type_to_text(out, bch2_data_types[type],
+       __dev_usage_type_to_text(out, type,
                        u->bucket_size,
                        u->d[type].buckets,
                        sectors,
@@ -153,7 +156,8 @@ static void replicas_usage_to_text(struct printbuf *out,
        *d++ = ']';
        *d++ = '\0';
 
-       prt_printf(out, "%s: ", bch2_data_types[r->r.data_type]);
+       bch2_prt_data_type(out, r->r.data_type);
+       prt_char(out, ':');
        prt_tab(out);
 
        prt_printf(out, "%u/%u ", r->r.nr_required, r->r.nr_devs);
diff --git a/c_src/cmd_migrate.c b/c_src/cmd_migrate.c
index ea32e4ee497b7cf7ea178cced1c4ec65a0fedbe4..1c7cc929be0d9d6b84a73e4efeece9e61fd74cb5 100644
@@ -676,7 +676,12 @@ static int migrate_fs(const char           *fs_path,
        struct dev_opts dev = dev_opts_default();
 
        dev.path = dev_t_to_path(stat.st_dev);
-       dev.bdev = blkdev_get_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
+       dev.handle = bdev_open_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
+
+       int ret = PTR_ERR_OR_ZERO(dev.handle);
+       if (ret < 0)
+               die("Error opening device to format %s: %s", dev.path, strerror(-ret));
+       dev.bdev = dev.handle->bdev;
 
        opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_fd));
 
@@ -722,7 +727,7 @@ static int migrate_fs(const char            *fs_path,
 
        mark_unreserved_space(c, extents);
 
-       int ret = bch2_fs_start(c);
+       ret = bch2_fs_start(c);
        if (ret)
                die("Error starting new filesystem: %s", bch2_err_str(ret));
 
diff --git a/c_src/libbcachefs.c b/c_src/libbcachefs.c
index ef4cc7181f877e30f1b976f45051160eeda875ac..703a0eca21358270f298e64e91c16502f9dbd80e 100644
@@ -20,6 +20,7 @@
 #include "crypto.h"
 #include "libbcachefs/bcachefs_format.h"
 #include "libbcachefs/btree_cache.h"
+#include "libbcachefs/buckets.h"
 #include "libbcachefs/checksum.h"
 #include "libbcachefs/disk_groups.h"
 #include "libbcachefs/journal_seq_blacklist.h"
@@ -531,7 +532,7 @@ int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
                       e.p.sectors_total
                       ? e.p.sectors_done * 100 / e.p.sectors_total
                       : 0,
-                      bch2_data_types[e.p.data_type]);
+                      bch2_data_type_str(e.p.data_type));
 
                switch (e.p.data_type) {
                case BCH_DATA_btree:
diff --git a/c_src/libbcachefs.h b/c_src/libbcachefs.h
index b189a208115f63266f1bfeba6ac47ca43fc05c45..739783f0ebdb7f11df8fecbbb6fb360b9ca968ac 100644
@@ -52,6 +52,7 @@ static inline struct format_opts format_opts_default()
 }
 
 struct dev_opts {
+       struct bdev_handle *handle;
        struct block_device *bdev;
        char            *path;
        u64             size;           /* bytes*/
diff --git a/c_src/tools-util.c b/c_src/tools-util.c
index a1bcd8eba550ded5df147dd5bfef012e8267502c..c0b6852a2534e04ecfdfee73edf518f978c61ad9 100644
@@ -189,12 +189,13 @@ int open_for_format(struct dev_opts *dev, bool force)
        const char *fs_type = NULL, *fs_label = NULL;
        size_t fs_type_len, fs_label_len;
 
-       dev->bdev = blkdev_get_by_path(dev->path,
+       dev->handle = bdev_open_by_path(dev->path,
                                BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL|BLK_OPEN_BUFFERED,
                                dev, NULL);
-       int ret = PTR_ERR_OR_ZERO(dev->bdev);
+       int ret = PTR_ERR_OR_ZERO(dev->handle);
        if (ret < 0)
                die("Error opening device to format %s: %s", dev->path, strerror(-ret));
+       dev->bdev = dev->handle->bdev;
 
        if (!(pr = blkid_new_probe()))
                die("blkid error 1");
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 39143117c1a9bf1e22a8cec790eb1e88d426257d..998f5e2c6686a150eded1aa729c9cf8dddf1a3bd 100644
@@ -89,10 +89,15 @@ struct blk_holder_ops {
         void (*mark_dead)(struct block_device *bdev);
 };
 
-void blkdev_put(struct block_device *bdev, void *holder);
-void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
-                                       void *holder, const struct blk_holder_ops *hop);
+struct bdev_handle {
+       struct block_device *bdev;
+       void *holder;
+       blk_mode_t mode;
+};
+
+void bdev_release(struct bdev_handle *);
+struct bdev_handle *bdev_open_by_path(const char *, blk_mode_t, void *,
+                                     const struct blk_holder_ops *);
 int lookup_bdev(const char *path, dev_t *);
 
 struct super_block {
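
Note: the shim above replaces blkdev_get_by_path()/blkdev_put() with the handle-based bdev_open_by_path()/bdev_release() pair, mirroring the kernel's bdev_handle API; callers now reach the device through handle->bdev, as the cmd_migrate.c and tools-util.c hunks show. A minimal usage sketch against this shim (open_device() is a hypothetical helper; PTR_ERR_OR_ZERO() and die() are the helpers this tree already uses):

    /* Sketch only: the open/check/use/release lifecycle with the new API. */
    static struct bdev_handle *open_device(const char *path, void *holder)
    {
            struct bdev_handle *handle =
                    bdev_open_by_path(path, BLK_OPEN_READ|BLK_OPEN_WRITE,
                                      holder, NULL);
            int ret = PTR_ERR_OR_ZERO(handle);

            if (ret < 0)
                    die("error opening %s: %s", path, strerror(-ret));
            return handle;  /* use handle->bdev; release with bdev_release(handle) */
    }
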
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 851a855d386884177eb50db59a67821168c2258d..27a7dad17eefb83b917569fdc6a5df7298f3859b 100644
@@ -83,6 +83,8 @@
 
 /********** net/core/skbuff.c **********/
 #define SKB_LIST_POISON_NEXT   ((void *)(0x800 + POISON_POINTER_DELTA))
+/********** net/ **********/
+#define NET_PTR_POISON         ((void *)(0x801 + POISON_POINTER_DELTA))
 
 /********** kernel/bpf/ **********/
 #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index a09b9d00226a4e1dd510c0c097ac59e7cb7d3c77..fd3e175d83423261d68124cd26fc0351488ad05e 100644
@@ -273,7 +273,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
                bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
                                 c, err, alloc_key_dirty_sectors_0,
                                 "data_type %s but dirty_sectors==0",
-                                bch2_data_types[a.v->data_type]);
+                                bch2_data_type_str(a.v->data_type));
                break;
        case BCH_DATA_cached:
                bkey_fsck_err_on(!a.v->cached_sectors ||
@@ -321,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 {
        struct bch_alloc_v4 _a;
        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
-       unsigned i;
 
        prt_newline(out);
        printbuf_indent_add(out, 2);
 
-       prt_printf(out, "gen %u oldest_gen %u data_type %s",
-              a->gen, a->oldest_gen,
-              a->data_type < BCH_DATA_NR
-              ? bch2_data_types[a->data_type]
-              : "(invalid data type)");
+       prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+       bch2_prt_data_type(out, a->data_type);
        prt_newline(out);
        prt_printf(out, "journal_seq       %llu",       a->journal_seq);
        prt_newline(out);
@@ -353,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        prt_printf(out, "fragmentation     %llu",       a->fragmentation_lru);
        prt_newline(out);
        prt_printf(out, "bp_start          %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
-       prt_newline(out);
-
-       if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
-               struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
-               const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
-               prt_printf(out, "backpointers:     %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
-               printbuf_indent_add(out, 2);
-
-               for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
-                       prt_newline(out);
-                       bch2_backpointer_to_text(out, &bps[i]);
-               }
-
-               printbuf_indent_sub(out, 2);
-       }
-
        printbuf_indent_sub(out, 2);
 }
 
@@ -839,7 +818,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
                }
        }
 
-       if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+       if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
                struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
                u64 journal_seq = trans->journal_res.seq;
                u64 bucket_journal_seq = new_a->journal_seq;
@@ -1625,13 +1604,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
        return ret;
 }
 
+struct discard_buckets_state {
+       u64             seen;
+       u64             open;
+       u64             need_journal_commit;
+       u64             discarded;
+       struct bch_dev  *ca;
+       u64             need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+       if (s->ca == ca)
+               return;
+
+       if (s->ca && s->need_journal_commit_this_dev >
+           bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+               bch2_journal_flush_async(&c->journal, NULL);
+
+       if (s->ca)
+               percpu_ref_put(&s->ca->ref);
+       if (ca)
+               percpu_ref_get(&ca->ref);
+       s->ca = ca;
+       s->need_journal_commit_this_dev = 0;
+}
+
 static int bch2_discard_one_bucket(struct btree_trans *trans,
                                   struct btree_iter *need_discard_iter,
                                   struct bpos *discard_pos_done,
-                                  u64 *seen,
-                                  u64 *open,
-                                  u64 *need_journal_commit,
-                                  u64 *discarded)
+                                  struct discard_buckets_state *s)
 {
        struct bch_fs *c = trans->c;
        struct bpos pos = need_discard_iter->pos;
@@ -1643,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
        int ret = 0;
 
        ca = bch_dev_bkey_exists(c, pos.inode);
+
        if (!percpu_ref_tryget(&ca->io_ref)) {
                bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
                return 0;
        }
 
+       discard_buckets_next_dev(c, s, ca);
+
        if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
-               (*open)++;
+               s->open++;
                goto out;
        }
 
        if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
                        c->journal.flushed_seq_ondisk,
                        pos.inode, pos.offset)) {
-               (*need_journal_commit)++;
+               s->need_journal_commit++;
+               s->need_journal_commit_this_dev++;
                goto out;
        }
 
@@ -1709,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
                 * This works without any other locks because this is the only
                 * thread that removes items from the need_discard tree
                 */
-               bch2_trans_unlock(trans);
+               bch2_trans_unlock_long(trans);
                blkdev_issue_discard(ca->disk_sb.bdev,
                                     k.k->p.offset * ca->mi.bucket_size,
                                     ca->mi.bucket_size,
@@ -1732,9 +1738,9 @@ write:
                goto out;
 
        count_event(c, bucket_discard);
-       (*discarded)++;
+       s->discarded++;
 out:
-       (*seen)++;
+       s->seen++;
        bch2_trans_iter_exit(trans, &iter);
        percpu_ref_put(&ca->io_ref);
        printbuf_exit(&buf);
@@ -1744,7 +1750,7 @@ out:
 static void bch2_do_discards_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
-       u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+       struct discard_buckets_state s = {};
        struct bpos discard_pos_done = POS_MAX;
        int ret;
 
@@ -1756,19 +1762,14 @@ static void bch2_do_discards_work(struct work_struct *work)
        ret = bch2_trans_run(c,
                for_each_btree_key(trans, iter,
                                   BTREE_ID_need_discard, POS_MIN, 0, k,
-                       bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
-                                               &seen,
-                                               &open,
-                                               &need_journal_commit,
-                                               &discarded)));
-
-       if (need_journal_commit * 2 > seen)
-               bch2_journal_flush_async(&c->journal, NULL);
+                       bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
 
-       bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+       discard_buckets_next_dev(c, &s, NULL);
 
-       trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+       trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
                              bch2_err_str(ret));
+
+       bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
 
 void bch2_do_discards(struct bch_fs *c)
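
Note: the discard path above folds four loose counters into struct discard_buckets_state, and discard_buckets_next_dev() runs once per device boundary, including a final call with a NULL device that flushes state for the last device and drops its percpu ref. A standalone sketch of that boundary-callback pattern, with all names invented for illustration:

    #include <stdio.h>
    #include <string.h>

    /* Sketch only: per-group state flushed at each boundary; a trailing
     * call with NULL finishes the final group, like the NULL call above. */
    struct group_state {
            const char      *cur;
            unsigned        count;
    };

    static void next_group(struct group_state *s, const char *g)
    {
            if (s->cur && g && !strcmp(s->cur, g))
                    return;                 /* still in the same group */
            if (s->cur)
                    printf("group %s: %u items\n", s->cur, s->count);
            s->cur = g;
            s->count = 0;
    }

    int main(void)
    {
            static const char *items[] = { "a", "a", "b", "b", "b", "c" };
            struct group_state s = { 0 };

            for (unsigned i = 0; i < 6; i++) {
                    next_group(&s, items[i]);
                    s.count++;
            }
            next_group(&s, NULL);           /* flush the final group */
            return 0;
    }
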
diff --git a/libbcachefs/alloc_background_format.h b/libbcachefs/alloc_background_format.h
new file mode 100644
index 0000000..b4ec20b
--- /dev/null
+++ b/libbcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+       struct bch_val          v;
+       __u8                    fields;
+       __u8                    gen;
+       __u8                    data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1()                  \
+       x(read_time,            16)             \
+       x(write_time,           16)             \
+       x(data_type,            8)              \
+       x(dirty_sectors,        16)             \
+       x(cached_sectors,       16)             \
+       x(oldest_gen,           8)              \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+       BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+       struct bch_val          v;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2()                  \
+       x(read_time,            64)             \
+       x(write_time,           64)             \
+       x(dirty_sectors,        32)             \
+       x(cached_sectors,       32)             \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
+
+struct bch_alloc_v3 {
+       struct bch_val          v;
+       __le64                  journal_seq;
+       __le32                  flags;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
+
+struct bch_alloc_v4 {
+       struct bch_val          v;
+       __u64                   journal_seq;
+       __u32                   flags;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    stripe_redundancy;
+       __u32                   dirty_sectors;
+       __u32                   cached_sectors;
+       __u64                   io_time[2];
+       __u32                   stripe;
+       __u32                   nr_external_backpointers;
+       __u64                   fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0   6
+#define BCH_ALLOC_V4_U64s      (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD,     struct bch_alloc_v4, flags,  0,  1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,     struct bch_alloc_v4, flags,  1,  2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,  struct bch_alloc_v4, flags,  8,  14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS      8
+#define KEY_TYPE_BUCKET_GENS_NR                (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK      (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+       struct bch_val          v;
+       u8                      gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
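
For reference, the BCH_ALLOC_FIELDS_V1() x-macro in this new header expands the anonymous enum into one enumerator per field, in declaration order; written out, that expansion is equivalent to:

    enum {
            BCH_ALLOC_FIELD_V1_read_time,           /* 0 */
            BCH_ALLOC_FIELD_V1_write_time,          /* 1 */
            BCH_ALLOC_FIELD_V1_data_type,           /* 2 */
            BCH_ALLOC_FIELD_V1_dirty_sectors,       /* 3 */
            BCH_ALLOC_FIELD_V1_cached_sectors,      /* 4 */
            BCH_ALLOC_FIELD_V1_oldest_gen,          /* 5 */
            BCH_ALLOC_FIELD_V1_stripe,              /* 6 */
            BCH_ALLOC_FIELD_V1_stripe_redundancy,   /* 7 */
    };
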
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index b0ff47998a9440912f940dc09e27b34e6341cb9e..633d3223b353f83e83501601024dd262952236c6 100644
@@ -1525,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
        unsigned data_type = ob->data_type;
        barrier(); /* READ_ONCE() doesn't work on bitfields */
 
-       prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+       prt_printf(out, "%zu ref %u ",
                   ob - c->open_buckets,
-                  atomic_read(&ob->pin),
-                  data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+                  atomic_read(&ob->pin));
+       bch2_prt_data_type(out, data_type);
+       prt_printf(out, " %u:%llu gen %u allocated %u/%u",
                   ob->dev, ob->bucket, ob->gen,
                   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
        if (ob->ec)
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index e358a2ffffdea48c80eee18ab299cd7103d72991..b4dc319bcb2bc0a5363e74f6d2096d3b5652599d 100644
@@ -400,13 +400,24 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
        return ret;
 }
 
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+       return bpos_eq(l.k->p, r.k->p) &&
+               bkey_bytes(l.k) == bkey_bytes(r.k) &&
+               !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+       struct bpos     bucket_start;
+       struct bpos     bucket_end;
+       struct bkey_buf last_flushed;
+};
+
 static int check_bp_exists(struct btree_trans *trans,
+                          struct extents_to_bp_state *s,
                           struct bpos bucket,
                           struct bch_backpointer bp,
-                          struct bkey_s_c orig_k,
-                          struct bpos bucket_start,
-                          struct bpos bucket_end,
-                          struct bkey_buf *last_flushed)
+                          struct bkey_s_c orig_k)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter bp_iter = { NULL };
@@ -417,8 +428,8 @@ static int check_bp_exists(struct btree_trans *trans,
 
        bch2_bkey_buf_init(&tmp);
 
-       if (bpos_lt(bucket, bucket_start) ||
-           bpos_gt(bucket, bucket_end))
+       if (bpos_lt(bucket, s->bucket_start) ||
+           bpos_gt(bucket, s->bucket_end))
                return 0;
 
        if (!bch2_dev_bucket_exists(c, bucket))
@@ -433,11 +444,9 @@ static int check_bp_exists(struct btree_trans *trans,
 
        if (bp_k.k->type != KEY_TYPE_backpointer ||
            memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
-               if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) ||
-                   bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) ||
-                   memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) {
-                       bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+               bch2_bkey_buf_reassemble(&tmp, c, orig_k);
 
+               if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
                        if (bp.level) {
                                bch2_trans_unlock(trans);
                                bch2_btree_interior_updates_flush(c);
@@ -447,7 +456,7 @@ static int check_bp_exists(struct btree_trans *trans,
                        if (ret)
                                goto err;
 
-                       bch2_bkey_buf_copy(last_flushed, c, tmp.k);
+                       bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
                        ret = -BCH_ERR_transaction_restart_write_buffer_flush;
                        goto out;
                }
@@ -475,10 +484,8 @@ missing:
 }
 
 static int check_extent_to_backpointers(struct btree_trans *trans,
+                                       struct extents_to_bp_state *s,
                                        enum btree_id btree, unsigned level,
-                                       struct bpos bucket_start,
-                                       struct bpos bucket_end,
-                                       struct bkey_buf *last_flushed,
                                        struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
@@ -498,9 +505,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
                bch2_extent_ptr_to_bp(c, btree, level,
                                      k, p, &bucket_pos, &bp);
 
-               ret = check_bp_exists(trans, bucket_pos, bp, k,
-                                     bucket_start, bucket_end,
-                                     last_flushed);
+               ret = check_bp_exists(trans, s, bucket_pos, bp, k);
                if (ret)
                        return ret;
        }
@@ -509,10 +514,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 }
 
 static int check_btree_root_to_backpointers(struct btree_trans *trans,
+                                           struct extents_to_bp_state *s,
                                            enum btree_id btree_id,
-                                           struct bpos bucket_start,
-                                           struct bpos bucket_end,
-                                           struct bkey_buf *last_flushed,
                                            int *level)
 {
        struct bch_fs *c = trans->c;
@@ -536,9 +539,7 @@ retry:
        *level = b->c.level;
 
        k = bkey_i_to_s_c(&b->key);
-       ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
-                                     bucket_start, bucket_end,
-                                     last_flushed, k);
+       ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
 err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
@@ -559,7 +560,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
 
        si_meminfo(&i);
        mem_bytes = i.totalram * i.mem_unit;
-       return div_u64(mem_bytes >> 1, btree_bytes(c));
+       return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
 }
 
 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,43 +611,35 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
 }
 
 static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
-                                                  struct bpos bucket_start,
-                                                  struct bpos bucket_end)
+                                                  struct extents_to_bp_state *s)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       enum btree_id btree_id;
-       struct bkey_s_c k;
-       struct bkey_buf last_flushed;
        int ret = 0;
 
-       bch2_bkey_buf_init(&last_flushed);
-       bkey_init(&last_flushed.k->k);
-
-       for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
+       for (enum btree_id btree_id = 0;
+            btree_id < btree_id_nr_alive(c);
+            btree_id++) {
                int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
 
                ret = commit_do(trans, NULL, NULL,
                                BCH_TRANS_COMMIT_no_enospc,
-                               check_btree_root_to_backpointers(trans, btree_id,
-                                                       bucket_start, bucket_end,
-                                                       &last_flushed, &level));
+                               check_btree_root_to_backpointers(trans, s, btree_id, &level));
                if (ret)
                        return ret;
 
                while (level >= depth) {
+                       struct btree_iter iter;
                        bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
                                                  level,
                                                  BTREE_ITER_PREFETCH);
                        while (1) {
                                bch2_trans_begin(trans);
-                               k = bch2_btree_iter_peek(&iter);
+
+                               struct bkey_s_c k = bch2_btree_iter_peek(&iter);
                                if (!k.k)
                                        break;
                                ret = bkey_err(k) ?:
-                                       check_extent_to_backpointers(trans, btree_id, level,
-                                                                    bucket_start, bucket_end,
-                                                                    &last_flushed, k) ?:
+                                       check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
                                        bch2_trans_commit(trans, NULL, NULL,
                                                          BCH_TRANS_COMMIT_no_enospc);
                                if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
@@ -668,7 +661,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
                }
        }
 
-       bch2_bkey_buf_exit(&last_flushed, c);
        return 0;
 }
 
@@ -731,37 +723,43 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
        struct btree_trans *trans = bch2_trans_get(c);
-       struct bpos start = POS_MIN, end;
+       struct extents_to_bp_state s = { .bucket_start = POS_MIN };
        int ret;
 
+       bch2_bkey_buf_init(&s.last_flushed);
+       bkey_init(&s.last_flushed.k->k);
+
        while (1) {
-               ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+               ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
                if (ret)
                        break;
 
-               if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+               if ( bpos_eq(s.bucket_start, POS_MIN) &&
+                   !bpos_eq(s.bucket_end, SPOS_MAX))
                        bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
                                    __func__, btree_nodes_fit_in_ram(c));
 
-               if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+               if (!bpos_eq(s.bucket_start, POS_MIN) ||
+                   !bpos_eq(s.bucket_end, SPOS_MAX)) {
                        struct printbuf buf = PRINTBUF;
 
                        prt_str(&buf, "check_extents_to_backpointers(): ");
-                       bch2_bpos_to_text(&buf, start);
+                       bch2_bpos_to_text(&buf, s.bucket_start);
                        prt_str(&buf, "-");
-                       bch2_bpos_to_text(&buf, end);
+                       bch2_bpos_to_text(&buf, s.bucket_end);
 
                        bch_verbose(c, "%s", buf.buf);
                        printbuf_exit(&buf);
                }
 
-               ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
-               if (ret || bpos_eq(end, SPOS_MAX))
+               ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+               if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
                        break;
 
-               start = bpos_successor(end);
+               s.bucket_start = bpos_successor(s.bucket_end);
        }
        bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&s.last_flushed, c);
 
        bch_err_fn(c, ret);
        return ret;
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
index 737e2396ade7ec44edf4f18738e286b5da3189bd..327365a9feac4e8fa69575ec6fe6157fd3edb127 100644
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
 #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
 
+#include "btree_cache.h"
 #include "btree_iter.h"
 #include "btree_update.h"
 #include "buckets.h"
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index dac383e3718163b6566eb2e6a4ff305fb65da715..b80c6c9efd8cef95b46b5b45b21f639e18373755 100644
@@ -1204,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
        return c->opts.block_size >> 9;
 }
 
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
-       return c->opts.btree_node_size >> 9;
-}
-
 static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
 {
        return c->btree_key_cache_btrees & (1U << btree);
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 0d5ac4184fbcef5a2b7ae618d6bdf81478f09530..0668b682a21ca8e035cae73f73e6774c99eaeb94 100644
@@ -417,600 +417,12 @@ struct bch_set {
        struct bch_val          v;
 };
 
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32    - 0b1
- * bch_extent_ptr      - 0b10
- * bch_extent_crc64    - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
 /* 128 bits, sufficient for cryptographic MACs: */
 struct bch_csum {
        __le64                  lo;
        __le64                  hi;
 } __packed __aligned(8);
 
-#define BCH_EXTENT_ENTRY_TYPES()               \
-       x(ptr,                  0)              \
-       x(crc32,                1)              \
-       x(crc64,                2)              \
-       x(crc128,               3)              \
-       x(stripe_ptr,           4)              \
-       x(rebalance,            5)
-#define BCH_EXTENT_ENTRY_MAX   6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
-       BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u32                   type:2,
-                               _compressed_size:7,
-                               _uncompressed_size:7,
-                               offset:7,
-                               _unused:1,
-                               csum_type:4,
-                               compression_type:4;
-       __u32                   csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u32                   csum;
-       __u32                   compression_type:4,
-                               csum_type:4,
-                               _unused:1,
-                               offset:7,
-                               _uncompressed_size:7,
-                               _compressed_size:7,
-                               type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX         (1U << 7)
-#define CRC32_NONCE_MAX                0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:3,
-                               _compressed_size:9,
-                               _uncompressed_size:9,
-                               offset:9,
-                               nonce:10,
-                               csum_type:4,
-                               compression_type:4,
-                               csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   csum_hi:16,
-                               compression_type:4,
-                               csum_type:4,
-                               nonce:10,
-                               offset:9,
-                               _uncompressed_size:9,
-                               _compressed_size:9,
-                               type:3;
-#endif
-       __u64                   csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX         (1U << 9)
-#define CRC64_NONCE_MAX                ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:4,
-                               _compressed_size:13,
-                               _uncompressed_size:13,
-                               offset:13,
-                               nonce:13,
-                               csum_type:4,
-                               compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   compression_type:4,
-                               csum_type:4,
-                               nonce:13,
-                               offset:13,
-                               _uncompressed_size:13,
-                               _compressed_size:13,
-                               type:4;
-#endif
-       struct bch_csum         csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX                (1U << 13)
-#define CRC128_NONCE_MAX       ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:1,
-                               cached:1,
-                               unused:1,
-                               unwritten:1,
-                               offset:44, /* 8 petabytes */
-                               dev:8,
-                               gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   gen:8,
-                               dev:8,
-                               offset:44,
-                               unwritten:1,
-                               unused:1,
-                               cached:1,
-                               type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:5,
-                               block:8,
-                               redundancy:4,
-                               idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   idx:47,
-                               redundancy:4,
-                               block:8,
-                               type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:6,
-                               unused:34,
-                               compression:8, /* enum bch_compression_opt */
-                               target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   target:16,
-                               compression:8,
-                               unused:34,
-                               type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
-       unsigned long                   type;
-#elif __BITS_PER_LONG == 32
-       struct {
-               unsigned long           pad;
-               unsigned long           type;
-       };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f  f;
-       BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
-       struct bch_val          v;
-
-       __u64                   _data[0];
-       struct bch_extent_ptr   start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
-       struct bch_val          v;
-
-       __u64                   mem_ptr;
-       __le64                  seq;
-       __le16                  sectors_written;
-       __le16                  flags;
-       struct bpos             min_key;
-       __u64                   _data[0];
-       struct bch_extent_ptr   start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
-       struct bch_val          v;
-
-       __u64                   _data[0];
-       union bch_extent_entry  start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
-       struct bch_val          v;
-
-       __le32                  generation;
-       __u8                    nr_replicas;
-       __u8                    pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
-       ((sizeof(struct bch_extent_crc128) +                    \
-         sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX                               \
-       (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX                            \
-       ((sizeof(struct bch_btree_ptr_v2) +                     \
-         sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX                                        \
-       (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX     4096
-
-#define BCACHEFS_ROOT_INO      4096
-
-struct bch_inode {
-       struct bch_val          v;
-
-       __le64                  bi_hash_seed;
-       __le32                  bi_flags;
-       __le16                  bi_mode;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
-       struct bch_val          v;
-
-       __le64                  bi_journal_seq;
-       __le64                  bi_hash_seed;
-       __le64                  bi_flags;
-       __le16                  bi_mode;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
-       struct bch_val          v;
-
-       __le64                  bi_journal_seq;
-       __le64                  bi_hash_seed;
-       __le64                  bi_flags;
-       __le64                  bi_sectors;
-       __le64                  bi_size;
-       __le64                  bi_version;
-       __u8                    fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL   6
-#define INODEv3_FIELDS_START_CUR       (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
-       struct bch_val          v;
-
-       __le32                  bi_generation;
-       __le32                  pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2()                  \
-       x(bi_atime,                     96)     \
-       x(bi_ctime,                     96)     \
-       x(bi_mtime,                     96)     \
-       x(bi_otime,                     96)     \
-       x(bi_size,                      64)     \
-       x(bi_sectors,                   64)     \
-       x(bi_uid,                       32)     \
-       x(bi_gid,                       32)     \
-       x(bi_nlink,                     32)     \
-       x(bi_generation,                32)     \
-       x(bi_dev,                       32)     \
-       x(bi_data_checksum,             8)      \
-       x(bi_compression,               8)      \
-       x(bi_project,                   32)     \
-       x(bi_background_compression,    8)      \
-       x(bi_data_replicas,             8)      \
-       x(bi_promote_target,            16)     \
-       x(bi_foreground_target,         16)     \
-       x(bi_background_target,         16)     \
-       x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)     \
-       x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)     \
-       x(bi_subvol,                    32)     \
-       x(bi_parent_subvol,             32)
-
-#define BCH_INODE_FIELDS_v3()                  \
-       x(bi_atime,                     96)     \
-       x(bi_ctime,                     96)     \
-       x(bi_mtime,                     96)     \
-       x(bi_otime,                     96)     \
-       x(bi_uid,                       32)     \
-       x(bi_gid,                       32)     \
-       x(bi_nlink,                     32)     \
-       x(bi_generation,                32)     \
-       x(bi_dev,                       32)     \
-       x(bi_data_checksum,             8)      \
-       x(bi_compression,               8)      \
-       x(bi_project,                   32)     \
-       x(bi_background_compression,    8)      \
-       x(bi_data_replicas,             8)      \
-       x(bi_promote_target,            16)     \
-       x(bi_foreground_target,         16)     \
-       x(bi_background_target,         16)     \
-       x(bi_erasure_code,              16)     \
-       x(bi_fields_set,                16)     \
-       x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)     \
-       x(bi_subvol,                    32)     \
-       x(bi_parent_subvol,             32)     \
-       x(bi_nocow,                     8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS()                       \
-       x(data_checksum,                8)      \
-       x(compression,                  8)      \
-       x(project,                      32)     \
-       x(background_compression,       8)      \
-       x(data_replicas,                8)      \
-       x(promote_target,               16)     \
-       x(foreground_target,            16)     \
-       x(background_target,            16)     \
-       x(erasure_code,                 16)     \
-       x(nocow,                        8)
-
-enum inode_opt_id {
-#define x(name, ...)                           \
-       Inode_opt_##name,
-       BCH_INODE_OPTS()
-#undef  x
-       Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS()                      \
-       x(sync,                         0)      \
-       x(immutable,                    1)      \
-       x(append,                       2)      \
-       x(nodump,                       3)      \
-       x(noatime,                      4)      \
-       x(i_size_dirty,                 5)      \
-       x(i_sectors_dirty,              6)      \
-       x(unlinked,                     7)      \
-       x(backptr_untrusted,            8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n)        BCH_INODE_##t = 1U << n,
-       BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n)        __BCH_INODE_##t = n,
-       BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS,        struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
-                               struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE,     struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
-       struct bch_val          v;
-
-       /* Target inode number: */
-       union {
-       __le64                  d_inum;
-       struct {                /* DT_SUBVOL */
-       __le32                  d_child_subvol;
-       __le32                  d_parent_subvol;
-       };
-       };
-
-       /*
-        * Copy of mode bits 12-15 from the target inode - so userspace can get
-        * the filetype without having to do a stat()
-        */
-       __u8                    d_type;
-
-       __u8                    d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL      16
-#define BCH_DT_MAX     17
-
-#define BCH_NAME_MAX   512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER                      0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS  1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED                   3
-#define KEY_TYPE_XATTR_INDEX_SECURITY          4
-
-struct bch_xattr {
-       struct bch_val          v;
-       __u8                    x_type;
-       __u8                    x_name_len;
-       __le16                  x_val_len;
-       __u8                    x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
-       struct bch_val          v;
-       __u8                    fields;
-       __u8                    gen;
-       __u8                    data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1()                  \
-       x(read_time,            16)             \
-       x(write_time,           16)             \
-       x(data_type,            8)              \
-       x(dirty_sectors,        16)             \
-       x(cached_sectors,       16)             \
-       x(oldest_gen,           8)              \
-       x(stripe,               32)             \
-       x(stripe_redundancy,    8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
-       BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
-       struct bch_val          v;
-       __u8                    nr_fields;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2()                  \
-       x(read_time,            64)             \
-       x(write_time,           64)             \
-       x(dirty_sectors,        32)             \
-       x(cached_sectors,       32)             \
-       x(stripe,               32)             \
-       x(stripe_redundancy,    8)
-
-struct bch_alloc_v3 {
-       struct bch_val          v;
-       __le64                  journal_seq;
-       __le32                  flags;
-       __u8                    nr_fields;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
-
-struct bch_alloc_v4 {
-       struct bch_val          v;
-       __u64                   journal_seq;
-       __u32                   flags;
-       __u8                    gen;
-       __u8                    oldest_gen;
-       __u8                    data_type;
-       __u8                    stripe_redundancy;
-       __u32                   dirty_sectors;
-       __u32                   cached_sectors;
-       __u64                   io_time[2];
-       __u32                   stripe;
-       __u32                   nr_external_backpointers;
-       __u64                   fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0   6
-#define BCH_ALLOC_V4_U64s      (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD,     struct bch_alloc_v4, flags,  0,  1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,     struct bch_alloc_v4, flags,  1,  2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,  struct bch_alloc_v4, flags,  8,  14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX       40
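The BITMASK()/LE32_BITMASK() macros (defined earlier in bcachefs_format.h)
generate accessors for a sub-range of bits within the flags word, e.g. bits
8-14 for NR_BACKPOINTERS above. A hedged standalone approximation of the
generated getter (the real __le variants also handle endian conversion):

#include <stdio.h>
#include <stdint.h>

/* extract bits [lo, hi) of a flags word, as the generated getter would */
static uint64_t get_bits(uint64_t flags, unsigned lo, unsigned hi)
{
        return (flags >> lo) & (((uint64_t) 1 << (hi - lo)) - 1);
}

int main(void)
{
        uint64_t flags = 0;
        flags |= 1ULL << 0;             /* NEED_DISCARD: bits 0-1 */
        flags |= (uint64_t) 12 << 8;    /* NR_BACKPOINTERS: bits 8-14 */
        printf("need_discard=%llu nr_backpointers=%llu\n",
               (unsigned long long) get_bits(flags, 0, 1),
               (unsigned long long) get_bits(flags, 8, 14));
        return 0;
}
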
-
 struct bch_backpointer {
        struct bch_val          v;
        __u8                    btree_id;
@@ -1021,154 +433,6 @@ struct bch_backpointer {
        struct bpos             pos;
 } __packed __aligned(8);
 
-#define KEY_TYPE_BUCKET_GENS_BITS      8
-#define KEY_TYPE_BUCKET_GENS_NR                (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK      (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
-       struct bch_val          v;
-       u8                      gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
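Each bch_bucket_gens value packs KEY_TYPE_BUCKET_GENS_NR (256) one-byte
generations into a single key: the key's offset selects the group and the low
bits of the bucket number select the slot. A standalone sketch of that index
arithmetic (variable names are illustrative):

#include <stdio.h>

#define BUCKET_GENS_BITS        8
#define BUCKET_GENS_NR          (1U << BUCKET_GENS_BITS)
#define BUCKET_GENS_MASK        (BUCKET_GENS_NR - 1)

int main(void)
{
        unsigned long long bucket = 1000;
        unsigned long long key_offset = bucket >> BUCKET_GENS_BITS;    /* which key */
        unsigned slot = bucket & BUCKET_GENS_MASK;                     /* which gens[] entry */
        printf("bucket %llu -> key offset %llu, gens[%u]\n", bucket, key_offset, slot);
        return 0;
}
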
-
-/* Quotas: */
-
-enum quota_types {
-       QTYP_USR                = 0,
-       QTYP_GRP                = 1,
-       QTYP_PRJ                = 2,
-       QTYP_NR                 = 3,
-};
-
-enum quota_counters {
-       Q_SPC                   = 0,
-       Q_INO                   = 1,
-       Q_COUNTERS              = 2,
-};
-
-struct bch_quota_counter {
-       __le64                  hardlimit;
-       __le64                  softlimit;
-};
-
-struct bch_quota {
-       struct bch_val          v;
-       struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
-       struct bch_val          v;
-       __le16                  sectors;
-       __u8                    algorithm;
-       __u8                    nr_blocks;
-       __u8                    nr_redundant;
-
-       __u8                    csum_granularity_bits;
-       __u8                    csum_type;
-       __u8                    pad;
-
-       struct bch_extent_ptr   ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
-       struct bch_val          v;
-       __le64                  idx;
-       /*
-        * A reflink pointer might point to an indirect extent which is then
-        * later split (by copygc or rebalance). If we only pointed to part of
-        * the original indirect extent, and then one of the fragments is
-        * outside the range we point to, we'd leak a refcount: so when creating
-        * reflink pointers, we need to store pad values to remember the full
-        * range we were taking a reference on.
-        */
-       __le32                  front_pad;
-       __le32                  back_pad;
-} __packed __aligned(8);
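A standalone sketch of the invariant the comment above describes: the pads
widen the referenced range beyond the live extent, so later splits of the
indirect extent cannot orphan a refcount (all numbers are illustrative):

#include <stdio.h>

int main(void)
{
        /* pointer covers sectors [idx, idx + size) of an indirect extent */
        unsigned long long idx = 16, size = 32;
        unsigned front_pad = 16, back_pad = 8;

        /* the refcounted range is widened by the pads on both sides */
        unsigned long long ref_start = idx - front_pad;         /* 0  */
        unsigned long long ref_end   = idx + size + back_pad;   /* 56 */
        printf("holds refs on [%llu, %llu)\n", ref_start, ref_end);
        return 0;
}
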
-
-struct bch_reflink_v {
-       struct bch_val          v;
-       __le64                  refcount;
-       union bch_extent_entry  start[0];
-       __u64                   _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
-       struct bch_val          v;
-       __le64                  refcount;
-       u8                      data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
-       struct bch_val          v;
-       u8                      data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN         POS(0, 1)
-#define SUBVOL_POS_MAX         POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL   1
-
-struct bch_subvolume {
-       struct bch_val          v;
-       __le32                  flags;
-       __le32                  snapshot;
-       __le64                  inode;
-       /*
-        * Snapshot subvolumes form a tree, separate from the snapshot nodes
-        * tree - if this subvolume is a snapshot, this is the ID of the
-        * subvolume it was created from:
-        */
-       __le32                  parent;
-       __le32                  pad;
-       bch_le128               otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO,         struct bch_subvolume, flags,  0,  1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,   struct bch_subvolume, flags,  2,  3)
-
-/* Snapshots */
-
-struct bch_snapshot {
-       struct bch_val          v;
-       __le32                  flags;
-       __le32                  parent;
-       __le32                  children[2];
-       __le32                  subvol;
-       /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
-       __le32                  tree;
-       __le32                  depth;
-       __le32                  skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED,     struct bch_snapshot, flags,  0,  1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
-
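Snapshot nodes form a tree via the parent field; ancestry checks walk toward
the root, and the skip[] entries let the walk jump several levels at a time.
A hedged standalone sketch of the plain parent walk only (the real
bch2_snapshot_is_ancestor() is considerably more involved; the table layout
below is invented for illustration):

#include <stdio.h>

struct snap { unsigned parent; };       /* parent == 0 means root */

static int is_ancestor(const struct snap *tbl, unsigned id, unsigned anc)
{
        while (id && id != anc)
                id = tbl[id].parent;
        return id == anc;
}

int main(void)
{
        struct snap tbl[4] = { {0}, {0}, {1}, {2} };    /* chain: 3 -> 2 -> 1 */
        printf("%d\n", is_ancestor(tbl, 3, 1));         /* prints 1 */
        return 0;
}
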
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us a persistent identifier for each tree of
- * bch_snapshot nodes, and allows us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
-       struct bch_val          v;
-       __le32                  master_subvol;
-       __le32                  root_snapshot;
-};
-
 /* LRU btree: */
 
 struct bch_lru {
@@ -1178,33 +442,6 @@ struct bch_lru {
 
 #define LRU_ID_STRIPES         (1U << 16)
 
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
-       struct bch_val          v;
-       __le32                  subvol;
-       __le32                  pad;
-       __le64                  inum;
-       __le64                  new_i_size;
-};
-
-enum logged_op_finsert_state {
-       LOGGED_OP_FINSERT_start,
-       LOGGED_OP_FINSERT_shift_extents,
-       LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
-       struct bch_val          v;
-       __u8                    state;
-       __u8                    pad[3];
-       __le32                  subvol;
-       __le64                  inum;
-       __le64                  dst_offset;
-       __le64                  src_offset;
-       __le64                  pos;
-};
-
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1230,6 +467,19 @@ struct bch_sb_field {
        x(ext,                          13)     \
        x(downgrade,                    14)
 
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
        BCH_SB_FIELDS()
@@ -1465,23 +715,6 @@ struct bch_sb_field_replicas {
        struct bch_replicas_entry_v1 entries[];
 } __packed __aligned(8);
 
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
-       __le32                          timelimit;
-       __le32                          warnlimit;
-};
-
-struct bch_sb_quota_type {
-       __le64                          flags;
-       struct bch_sb_quota_counter     c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
-       struct bch_sb_field             field;
-       struct bch_sb_quota_type        q[QTYP_NR];
-} __packed __aligned(8);
-
 /* BCH_SB_FIELD_disk_groups: */
 
 #define BCH_SB_LABEL_SIZE              32
@@ -1500,101 +733,6 @@ struct bch_sb_field_disk_groups {
        struct bch_disk_group   entries[];
 } __packed __aligned(8);
 
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS()                              \
-       x(io_read,                                      0)      \
-       x(io_write,                                     1)      \
-       x(io_move,                                      2)      \
-       x(bucket_invalidate,                            3)      \
-       x(bucket_discard,                               4)      \
-       x(bucket_alloc,                                 5)      \
-       x(bucket_alloc_fail,                            6)      \
-       x(btree_cache_scan,                             7)      \
-       x(btree_cache_reap,                             8)      \
-       x(btree_cache_cannibalize,                      9)      \
-       x(btree_cache_cannibalize_lock,                 10)     \
-       x(btree_cache_cannibalize_lock_fail,            11)     \
-       x(btree_cache_cannibalize_unlock,               12)     \
-       x(btree_node_write,                             13)     \
-       x(btree_node_read,                              14)     \
-       x(btree_node_compact,                           15)     \
-       x(btree_node_merge,                             16)     \
-       x(btree_node_split,                             17)     \
-       x(btree_node_rewrite,                           18)     \
-       x(btree_node_alloc,                             19)     \
-       x(btree_node_free,                              20)     \
-       x(btree_node_set_root,                          21)     \
-       x(btree_path_relock_fail,                       22)     \
-       x(btree_path_upgrade_fail,                      23)     \
-       x(btree_reserve_get_fail,                       24)     \
-       x(journal_entry_full,                           25)     \
-       x(journal_full,                                 26)     \
-       x(journal_reclaim_finish,                       27)     \
-       x(journal_reclaim_start,                        28)     \
-       x(journal_write,                                29)     \
-       x(read_promote,                                 30)     \
-       x(read_bounce,                                  31)     \
-       x(read_split,                                   33)     \
-       x(read_retry,                                   32)     \
-       x(read_reuse_race,                              34)     \
-       x(move_extent_read,                             35)     \
-       x(move_extent_write,                            36)     \
-       x(move_extent_finish,                           37)     \
-       x(move_extent_fail,                             38)     \
-       x(move_extent_start_fail,                       39)     \
-       x(copygc,                                       40)     \
-       x(copygc_wait,                                  41)     \
-       x(gc_gens_end,                                  42)     \
-       x(gc_gens_start,                                43)     \
-       x(trans_blocked_journal_reclaim,                44)     \
-       x(trans_restart_btree_node_reused,              45)     \
-       x(trans_restart_btree_node_split,               46)     \
-       x(trans_restart_fault_inject,                   47)     \
-       x(trans_restart_iter_upgrade,                   48)     \
-       x(trans_restart_journal_preres_get,             49)     \
-       x(trans_restart_journal_reclaim,                50)     \
-       x(trans_restart_journal_res_get,                51)     \
-       x(trans_restart_key_cache_key_realloced,        52)     \
-       x(trans_restart_key_cache_raced,                53)     \
-       x(trans_restart_mark_replicas,                  54)     \
-       x(trans_restart_mem_realloced,                  55)     \
-       x(trans_restart_memory_allocation_failure,      56)     \
-       x(trans_restart_relock,                         57)     \
-       x(trans_restart_relock_after_fill,              58)     \
-       x(trans_restart_relock_key_cache_fill,          59)     \
-       x(trans_restart_relock_next_node,               60)     \
-       x(trans_restart_relock_parent_for_fill,         61)     \
-       x(trans_restart_relock_path,                    62)     \
-       x(trans_restart_relock_path_intent,             63)     \
-       x(trans_restart_too_many_iters,                 64)     \
-       x(trans_restart_traverse,                       65)     \
-       x(trans_restart_upgrade,                        66)     \
-       x(trans_restart_would_deadlock,                 67)     \
-       x(trans_restart_would_deadlock_write,           68)     \
-       x(trans_restart_injected,                       69)     \
-       x(trans_restart_key_cache_upgrade,              70)     \
-       x(trans_traverse_all,                           71)     \
-       x(transaction_commit,                           72)     \
-       x(write_super,                                  73)     \
-       x(trans_restart_would_deadlock_recursion_limit, 74)     \
-       x(trans_restart_write_buffer_flush,             75)     \
-       x(trans_restart_split_race,                     76)     \
-       x(write_buffer_flush_slowpath,                  77)     \
-       x(write_buffer_flush_sync,                      78)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
-       BCH_PERSISTENT_COUNTERS()
-#undef x
-       BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
-       struct bch_sb_field     field;
-       __le64                  d[];
-};
-
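The counters field stores one __le64 per x-macro entry, indexed by the
BCH_COUNTER_* enum generated above. A standalone sketch of that layout
(enum names here are stand-ins for the generated ones):

#include <stdio.h>
#include <stdint.h>

enum { COUNTER_io_read, COUNTER_io_write, COUNTER_NR };

int main(void)
{
        uint64_t d[COUNTER_NR] = { 1024, 512 }; /* plays the role of field->d[] */
        printf("io_write = %llu\n", (unsigned long long) d[COUNTER_io_write]);
        return 0;
}
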
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
  * the superblock:
index abdb05507d162c7c06bb89ce96bf67f6484207a7..76e79a15ba08fb23ed9d0560dcd5966fe68ce92a 100644 (file)
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
                        next_key_bits -= 64;
                }
 
-               bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+               bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
 
                if (!next_key_bits)
                        break;
index 761f5e33b1e69e94ca0aaaa41a9825e496b5840f..5e52684764eb14de4d8433abd5954a829648440b 100644 (file)
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+                                   struct bkey_s_c k)
+{
+       struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
+
+       prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
+
 #define bch2_bkey_ops_cookie ((struct bkey_ops) {      \
        .key_invalid    = key_type_cookie_invalid,      \
+       .val_to_text    = key_type_cookie_to_text,      \
        .min_val_size   = 8,                            \
 })
 
index ee82283722b759bbce174b2d902403c0024fe574..03efe8ee565a90672367c2146e3ff44ceb0db526 100644 (file)
@@ -83,9 +83,10 @@ enum btree_update_flags {
 
        __BTREE_TRIGGER_NORUN,
        __BTREE_TRIGGER_TRANSACTIONAL,
+       __BTREE_TRIGGER_ATOMIC,
+       __BTREE_TRIGGER_GC,
        __BTREE_TRIGGER_INSERT,
        __BTREE_TRIGGER_OVERWRITE,
-       __BTREE_TRIGGER_GC,
        __BTREE_TRIGGER_BUCKET_INVALIDATE,
 };
 
@@ -107,6 +108,10 @@ enum btree_update_flags {
  * causing us to go emergency read-only)
  */
 #define BTREE_TRIGGER_TRANSACTIONAL    (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC           (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
 
 /* @new is entering the btree */
 #define BTREE_TRIGGER_INSERT           (1U << __BTREE_TRIGGER_INSERT)
@@ -114,9 +119,6 @@ enum btree_update_flags {
 /* @old is leaving the btree */
 #define BTREE_TRIGGER_OVERWRITE                (1U << __BTREE_TRIGGER_OVERWRITE)
 
-/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
-#define BTREE_TRIGGER_GC               (1U << __BTREE_TRIGGER_GC)
-
 /* signal from bucket invalidate path to alloc trigger */
 #define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
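This hunk reorders the flag bits so the new ATOMIC flag and GC sit next to
TRANSACTIONAL; a trigger can then branch on the phase it was invoked in. A
hedged standalone sketch of that dispatch (bit positions mirror the enum
above, but the trigger body is invented for illustration):

#include <stdio.h>

#define TRIGGER_TRANSACTIONAL   (1U << 1)
#define TRIGGER_ATOMIC          (1U << 2)
#define TRIGGER_GC              (1U << 3)

static void run_trigger(unsigned flags)
{
        if (flags & TRIGGER_TRANSACTIONAL)
                printf("transactional: emit further btree updates\n");
        else if (flags & TRIGGER_ATOMIC)
                printf("atomic: update in-memory usage under the write lock\n");
        else if (flags & TRIGGER_GC)
                printf("gc: recalculate usage from scratch\n");
}

int main(void)
{
        run_trigger(TRIGGER_ATOMIC);
        return 0;
}
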
 
index 74bf8eb90a4c42cd24dc61024ecb448740e271a7..3fd1085b6c61ee72e7e814cf722306ebdba057c4 100644 (file)
@@ -720,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
 {
        struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
        struct bkey_i min_key, max_key;
-       unsigned j, cacheline = 1;
+       unsigned cacheline = 1;
 
        t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
                      bset_ro_tree_capacity(b, t));
@@ -823,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
        set_btree_bset(b, t, i);
 }
 
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
-                        struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
 {
        struct bset *i = &bne->keys;
        struct bset_tree *t;
 
-       BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+       BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
        BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
        BUG_ON(b->nsets >= MAX_BSETS);
 
index 632c2b8c54609b4be37f11e18868e4c41dcb736b..79c77baaa383868c99660a78a656c73d187f996f 100644 (file)
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
 void bch2_btree_keys_init(struct btree *);
 
 void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
-                        struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
 void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
 
 void bch2_bset_insert(struct btree *, struct btree_node_iter *,
index 8e2488a4b58d00a45f78a7c64a6c1e83f4b0ff59..d7c81beac14afae7ee44f11f28eb424f1b54a063 100644 (file)
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 
        clear_btree_node_just_written(b);
 
-       kvpfree(b->data, btree_bytes(c));
+       kvpfree(b->data, btree_buf_bytes(b));
        b->data = NULL;
 #ifdef __KERNEL__
        kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
        BUG_ON(b->data || b->aux_data);
 
-       b->data = kvpmalloc(btree_bytes(c), gfp);
+       b->data = kvpmalloc(btree_buf_bytes(b), gfp);
        if (!b->data)
                return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 #ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
                b->aux_data = NULL;
 #endif
        if (!b->aux_data) {
-               kvpfree(b->data, btree_bytes(c));
+               kvpfree(b->data, btree_buf_bytes(b));
                b->data = NULL;
                return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
        }
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
        bkey_btree_ptr_init(&b->key);
        INIT_LIST_HEAD(&b->list);
        INIT_LIST_HEAD(&b->write_blocked);
-       b->byte_order = ilog2(btree_bytes(c));
+       b->byte_order = ilog2(c->opts.btree_node_size);
        return b;
 }
 
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);
 
-       kvpfree(c->verify_ondisk, btree_bytes(c));
+       kvpfree(c->verify_ondisk, c->opts.btree_node_size);
 
        for (i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
@@ -1192,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
               "    failed unpacked %zu\n",
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
-              btree_bytes(c) - sizeof(struct btree_node),
+              btree_buf_bytes(b) - sizeof(struct btree_node),
               b->nr.live_u64s * 100 / btree_max_u64s(c),
               b->sib_u64s[0],
               b->sib_u64s[1],
index 4e1af58820522fc8feec3caf9afc34d12f76c772..6d33885fdbde0d101b4c5785a1bf57bf072fe8de 100644 (file)
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
             _iter = 0; _iter < (_tbl)->size; _iter++)                  \
                rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
 
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
 {
-       return c->opts.btree_node_size;
+       return 1UL << b->byte_order;
 }
 
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
 {
-       return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+       return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
 }
 
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
 {
-       return btree_bytes(c) / PAGE_SIZE;
+       return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
 }
 
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+       return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c)
 {
        return btree_sectors(c) >> c->block_bits;
 }
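These helpers replace the fs-wide btree_bytes(c) with per-node sizes derived
from b->byte_order, which btree_cache.c now sets from
ilog2(c->opts.btree_node_size). A standalone sketch of the round trip,
assuming the node size is a power of two:

#include <stdio.h>

static unsigned ilog2_u32(unsigned v)   /* minimal ilog2 stand-in */
{
        unsigned r = 0;
        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned btree_node_size = 256 << 10;           /* 256 KiB */
        unsigned byte_order = ilog2_u32(btree_node_size);
        printf("byte_order=%u buf_bytes=%lu\n",
               byte_order, 1UL << byte_order);          /* back to 262144 */
        return 0;
}
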
index 49b4ade758c3623ed35557a02a00afd31b0bec52..1102995643b137c3a8a9fe5f12f0cce95edfafeb 100644 (file)
@@ -597,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                              "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                             bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                             bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
                              p.ptr.gen,
                              (printbuf_reset(&buf),
                               bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -615,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                              "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                             bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                             bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
                              p.ptr.gen, g->gen,
                              (printbuf_reset(&buf),
                               bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -637,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                              "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-                             bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                             bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
                              p.ptr.gen,
                              (printbuf_reset(&buf),
                               bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -649,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                              "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                             bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                             bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
                              p.ptr.gen, g->gen,
                              (printbuf_reset(&buf),
                               bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -664,8 +664,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                                "bucket %u:%zu different types of data in same bucket: %s, %s\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                               bch2_data_types[g->data_type],
-                               bch2_data_types[data_type],
+                               bch2_data_type_str(g->data_type),
+                               bch2_data_type_str(data_type),
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (data_type == BCH_DATA_btree) {
@@ -1238,11 +1238,11 @@ static int bch2_gc_done(struct bch_fs *c,
 
                for (i = 0; i < BCH_DATA_NR; i++) {
                        copy_dev_field(dev_usage_buckets_wrong,
-                                      d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                                      d[i].buckets,    "%s buckets", bch2_data_type_str(i));
                        copy_dev_field(dev_usage_sectors_wrong,
-                                      d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                                      d[i].sectors,    "%s sectors", bch2_data_type_str(i));
                        copy_dev_field(dev_usage_fragmented_wrong,
-                                      d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+                                      d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
                }
        }
 
@@ -1253,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
                        bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
 
                copy_fs_field(fs_usage_hidden_wrong,
-                             hidden,           "hidden");
+                             b.hidden,         "hidden");
                copy_fs_field(fs_usage_btree_wrong,
-                             btree,            "btree");
+                             b.btree,          "btree");
 
                if (!metadata_only) {
                        copy_fs_field(fs_usage_data_wrong,
-                                     data,     "data");
+                                     b.data,   "data");
                        copy_fs_field(fs_usage_cached_wrong,
-                                     cached,   "cached");
+                                     b.cached, "cached");
                        copy_fs_field(fs_usage_reserved_wrong,
-                                     reserved, "reserved");
+                                     b.reserved,       "reserved");
                        copy_fs_field(fs_usage_nr_inodes_wrong,
-                                     nr_inodes,"nr_inodes");
+                                     b.nr_inodes,"nr_inodes");
 
                        for (i = 0; i < BCH_REPLICAS_MAX; i++)
                                copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1417,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
                        ": got %s, should be %s",
                        iter->pos.inode, iter->pos.offset,
                        gc.gen,
-                       bch2_data_types[new.data_type],
-                       bch2_data_types[gc.data_type]))
+                       bch2_data_type_str(new.data_type),
+                       bch2_data_type_str(gc.data_type)))
                new.data_type = gc.data_type;
 
 #define copy_bucket_field(_errtype, _f)                                        \
@@ -1428,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
                        ": got %u, should be %u",                       \
                        iter->pos.inode, iter->pos.offset,              \
                        gc.gen,                                         \
-                       bch2_data_types[gc.data_type],                  \
+                       bch2_data_type_str(gc.data_type),               \
                        new._f, gc._f))                                 \
                new._f = gc._f;                                         \
 
index 33db48e2153fef61f0c733f97278018f419c2b05..aa9b6cbe3226909626411b886731a8bb8648a558 100644 (file)
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
        unsigned flags = memalloc_nofs_save();
        void *p;
 
-       BUG_ON(size > btree_bytes(c));
+       BUG_ON(size > c->opts.btree_node_size);
 
        *used_mempool = false;
        p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
 
        ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
 
-       for (k = unwritten_whiteouts_start(c, b);
-            k != unwritten_whiteouts_end(c, b);
+       for (k = unwritten_whiteouts_start(b);
+            k != unwritten_whiteouts_end(b);
             k = bkey_p_next(k))
                *--ptrs = k;
 
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
        verify_no_dups(b, new_whiteouts,
                       (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
 
-       memcpy_u64s(unwritten_whiteouts_start(c, b),
+       memcpy_u64s(unwritten_whiteouts_start(b),
                    new_whiteouts, b->whiteout_u64s);
 
        btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        }
 
        bytes = sorting_entire_node
-               ? btree_bytes(c)
+               ? btree_buf_bytes(b)
                : __vstruct_bytes(struct btree_node, u64s);
 
        out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        if (sorting_entire_node) {
                u64s = le16_to_cpu(out->keys.u64s);
 
-               BUG_ON(bytes != btree_bytes(c));
+               BUG_ON(bytes != btree_buf_bytes(b));
 
                /*
                 * Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
 
        bne = want_new_bset(c, b);
        if (bne)
-               bch2_bset_init_next(c, b, bne);
+               bch2_bset_init_next(b, bne);
 
        bch2_btree_build_aux_trees(b);
 
@@ -1160,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                             ptr_written, b->written);
        } else {
                for (bne = write_block(b);
-                    bset_byte_offset(b, bne) < btree_bytes(c);
+                    bset_byte_offset(b, bne) < btree_buf_bytes(b);
                     bne = (void *) bne + block_bytes(c))
                        btree_err_on(bne->keys.seq == b->data->keys.seq &&
                                     !bch2_journal_seq_is_blacklisted(c,
@@ -1172,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                     "found bset signature after last bset");
        }
 
-       sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+       sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
        sorted->keys.u64s = 0;
 
        set_btree_bset(b, b->set, &b->data->keys);
@@ -1188,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
        BUG_ON(b->nr.live_u64s != u64s);
 
-       btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+       btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
 
        if (updated_range)
                bch2_btree_node_drop_keys_outside_node(b);
@@ -1284,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
                rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
                bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
                bio->bi_iter.bi_sector  = rb->pick.ptr.offset;
-               bio->bi_iter.bi_size    = btree_bytes(c);
+               bio->bi_iter.bi_size    = btree_buf_bytes(b);
 
                if (rb->have_ioref) {
                        bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1512,7 +1512,7 @@ fsck_err:
        }
 
        if (best >= 0) {
-               memcpy(b->data, ra->buf[best], btree_bytes(c));
+               memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
                ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
        } else {
                ret = -1;
@@ -1578,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
        for (i = 0; i < ra->nr; i++) {
                ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
                ra->bio[i] = bio_alloc_bioset(NULL,
-                                             buf_pages(ra->buf[i], btree_bytes(c)),
+                                             buf_pages(ra->buf[i], btree_buf_bytes(b)),
                                              REQ_OP_READ|REQ_SYNC|REQ_META,
                                              GFP_NOFS,
                                              &c->btree_bio);
@@ -1598,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
                rb->pick                = pick;
                rb->bio.bi_iter.bi_sector = pick.ptr.offset;
                rb->bio.bi_end_io       = btree_node_read_all_replicas_endio;
-               bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+               bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
 
                if (rb->have_ioref) {
                        this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1665,7 +1665,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
        ca = bch_dev_bkey_exists(c, pick.ptr.dev);
 
        bio = bio_alloc_bioset(NULL,
-                              buf_pages(b->data, btree_bytes(c)),
+                              buf_pages(b->data, btree_buf_bytes(b)),
                               REQ_OP_READ|REQ_SYNC|REQ_META,
                               GFP_NOFS,
                               &c->btree_bio);
@@ -1679,7 +1679,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
        INIT_WORK(&rb->work, btree_node_read_work);
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bio->bi_end_io          = btree_node_read_endio;
-       bch2_bio_map(bio, b->data, btree_bytes(c));
+       bch2_bio_map(bio, b->data, btree_buf_bytes(b));
 
        if (rb->have_ioref) {
                this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -2074,8 +2074,8 @@ do_write:
        i->u64s         = 0;
 
        sort_iter_add(&sort_iter.iter,
-                     unwritten_whiteouts_start(c, b),
-                     unwritten_whiteouts_end(c, b));
+                     unwritten_whiteouts_start(b),
+                     unwritten_whiteouts_end(b));
        SET_BSET_SEPARATE_WHITEOUTS(i, false);
 
        b->whiteout_u64s = 0;
@@ -2251,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
 
        bne = want_new_bset(c, b);
        if (bne)
-               bch2_bset_init_next(c, b, bne);
+               bch2_bset_init_next(b, bne);
 
        bch2_btree_build_aux_trees(b);
 
index fa298289e01656b989db38dcf19301ae4d880bb7..5467a8635be113102c56bb6f02986209533c35ac 100644 (file)
@@ -1337,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in
 
        if (path->should_be_locked &&
            !trans->restarted &&
-           (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+           (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
                return;
 
        if (dup) {
index da2b74fa63fcece86d7d92d18dc340330180c657..24772538e4cc74ada59851bd7847dd5ece5ea122 100644 (file)
@@ -819,6 +819,11 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
        for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
 
+/*
+ * This should not be used in a fastpath without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * can potentially livelock:
+ */
 #define drop_locks_do(_trans, _do)                                     \
 ({                                                                     \
        bch2_trans_unlock(_trans);                                      \
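A standalone toy illustrating the pattern the new comment prescribes: attempt
the operation nonblocking first, and only on failure pay for the unlock/relock
cycle that drop_locks_do() implies (all names below are stand-ins, not
bcachefs API):

#include <stdio.h>
#include <errno.h>

static int op_nonblocking(void)
{
        return -EAGAIN;                 /* would block */
}

static int op_blocking(int *locked)
{
        *locked = 0;                    /* drop_locks_do(): unlock ... */
        /* ... do the blocking work ... */
        *locked = 1;                    /* ... then relock */
        return 0;
}

int main(void)
{
        int locked = 1;
        int ret = op_nonblocking();     /* fast path first */
        if (ret == -EAGAIN)
                ret = op_blocking(&locked);
        printf("ret=%d locked=%d\n", ret, locked);
        return 0;
}
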
index 2d1c95c42f240cc88b31c2728d7a970560e4865a..6843974423381029e7a8cf24fd4cd5c6c33627cd 100644 (file)
@@ -92,7 +92,7 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
                        continue;
 
                bch2_btree_trans_to_text(out, i->trans);
-               bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1);
+               bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
        }
 }
 
@@ -227,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
                        prt_printf(&buf, "backtrace:");
                        prt_newline(&buf);
                        printbuf_indent_add(&buf, 2);
-                       bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2);
+                       bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
                        printbuf_indent_sub(&buf, 2);
                        prt_newline(&buf);
                }
@@ -631,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
 }
 
 __flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
-                       struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
 {
        struct get_locks_fail f;
 
@@ -642,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
 int __bch2_btree_path_relock(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
 {
-       if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+       if (!bch2_btree_path_relock_norestart(trans, path)) {
                trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
                return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
        }
@@ -759,12 +758,39 @@ int bch2_trans_relock(struct btree_trans *trans)
        if (unlikely(trans->restarted))
                return -((int) trans->restarted);
 
-       trans_for_each_path(trans, path, i)
+       trans_for_each_path(trans, path, i) {
+               struct get_locks_fail f;
+
                if (path->should_be_locked &&
-                   !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
-                       trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+                   !btree_path_get_locks(trans, path, false, &f)) {
+                       if (trace_trans_restart_relock_enabled()) {
+                               struct printbuf buf = PRINTBUF;
+
+                               bch2_bpos_to_text(&buf, path->pos);
+                               prt_printf(&buf, " l=%u seq=%u node seq=",
+                                          f.l, path->l[f.l].lock_seq);
+                               if (IS_ERR_OR_NULL(f.b)) {
+                                       prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+                               } else {
+                                       prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+                                       struct six_lock_count c =
+                                               bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+                                       prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+                                       c = six_lock_counts(&f.b->c.lock);
+                                       prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+                               }
+
+                               trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+                               printbuf_exit(&buf);
+                       }
+
+                       count_event(trans->c, trans_restart_relock);
                        return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
                }
+       }
+
        return 0;
 }
 
@@ -778,7 +804,7 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
 
        trans_for_each_path(trans, path, i)
                if (path->should_be_locked &&
-                   !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+                   !bch2_btree_path_relock_norestart(trans, path)) {
                        return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
                }
        return 0;
index cc5500a957a1b3084d005abe8b0893146e354bca..4bd72c855da1a4028106b70e10727ad07d578614 100644 (file)
@@ -312,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
 
 /* relock: */
 
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
-                                     struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
 int __bch2_btree_path_relock(struct btree_trans *,
                             struct btree_path *, unsigned long);
 
@@ -353,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
 
 /* upgrade */
 
-
-struct get_locks_fail {
-       unsigned        l;
-       struct btree    *b;
-};
-
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
                               struct btree_path *, unsigned,
                               struct get_locks_fail *);
index 80505554498cf96697cd8c8108207a92e854a8aa..30d69a6d133eec77c76c7e64a5de0d896ad6b732 100644 (file)
@@ -139,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
        EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
        EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
        EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
-       EBUG_ON(insert->k.u64s >
-               bch_btree_keys_u64s_remaining(trans->c, b));
+       EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
        EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
 
        k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -160,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
                k->type = KEY_TYPE_deleted;
 
                if (k->needs_whiteout)
-                       push_whiteout(trans->c, b, insert->k.p);
+                       push_whiteout(b, insert->k.p);
                k->needs_whiteout = false;
 
                if (k >= btree_bset_last(b)->start) {
@@ -348,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
 static inline int btree_key_can_insert(struct btree_trans *trans,
                                       struct btree *b, unsigned u64s)
 {
-       struct bch_fs *c = trans->c;
-
-       if (!bch2_btree_node_insert_fits(c, b, u64s))
+       if (!bch2_btree_node_insert_fits(b, u64s))
                return -BCH_ERR_btree_insert_btree_node_full;
 
        return 0;
@@ -418,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
                return 0;
 
        new_u64s        = roundup_pow_of_two(u64s);
-       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
        if (unlikely(!new_k))
                return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
 
@@ -448,9 +445,6 @@ static int run_one_mem_trigger(struct btree_trans *trans,
        if (unlikely(flags & BTREE_TRIGGER_NORUN))
                return 0;
 
-       if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
-               return 0;
-
        if (old_ops->trigger == new_ops->trigger) {
                ret   = bch2_key_trigger(trans, i->btree_id, i->level,
                                old, bkey_i_to_s(new),
@@ -586,9 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 
 static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
 {
-       struct bch_fs *c = trans->c;
-       int ret = 0;
-
        trans_for_each_update(trans, i) {
                /*
                 * XXX: synchronization of cached update triggers with gc
@@ -596,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
                 */
                BUG_ON(i->cached || i->level);
 
-               if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) {
-                       ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+               if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+                   gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+                       int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
                        if (ret)
-                               break;
+                               return ret;
                }
        }
 
-       return ret;
+       return 0;
 }
 
 static inline int
@@ -680,6 +672,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
            bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
                return -BCH_ERR_btree_insert_need_mark_replicas;
 
+       /* XXX: we only want to run this if deltas are nonzero */
+       bch2_trans_account_disk_usage_change(trans);
+
        h = trans->hooks;
        while (h) {
                ret = h->fn(trans, h);
@@ -689,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
        }
 
        trans_for_each_update(trans, i)
-               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
-                       ret = run_one_mem_trigger(trans, i, i->flags);
+               if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+                       ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
                        if (ret)
                                goto fatal_err;
                }
@@ -994,6 +989,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
            !trans->journal_entries_u64s)
                goto out_reset;
 
+       memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+
        ret = bch2_trans_commit_run_triggers(trans);
        if (ret)
                goto out_reset;
@@ -1018,9 +1015,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
        for (struct jset_entry *i = trans->journal_entries;
             i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
             i = vstruct_next(i)) {
-               if (!jset_entry_is_key(i))
-                       continue;
-
                enum bkey_invalid_flags invalid_flags = 0;
 
                if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
index d530307046f4cf93bdb4c4063409a9fff5e705c4..4a5a64499eb76698743ae7f20b4e47eaca09b868 100644 (file)
@@ -430,6 +430,9 @@ struct btree_trans {
        struct journal_res      journal_res;
        u64                     *journal_seq;
        struct disk_reservation *disk_res;
+
+       struct bch_fs_usage_base fs_usage_delta;
+
        unsigned                journal_u64s;
        unsigned                extra_disk_res; /* XXX kill */
        struct replicas_delta_list *fs_usage_deltas;
@@ -653,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
         BIT_ULL(BKEY_TYPE_reflink)|                    \
         BIT_ULL(BKEY_TYPE_btree))
 
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS            \
        (BIT_ULL(BKEY_TYPE_alloc)|                      \
         BIT_ULL(BKEY_TYPE_inodes)|                     \
         BIT_ULL(BKEY_TYPE_stripes)|                    \
@@ -661,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
 
 #define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
-        BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+        BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
@@ -738,4 +741,9 @@ enum btree_node_sibling {
        btree_next_sib,
 };
 
+struct get_locks_fail {
+       unsigned        l;
+       struct btree    *b;
+};
+
 #endif /* _BCACHEFS_BTREE_TYPES_H */
index 44f9dfa28a09d89984150b19d3831077a18485f1..17a5938aa71a6b43b45c12383e4690df146ee2a3 100644 (file)
@@ -159,7 +159,7 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
 {
        size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
 
-       return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+       return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
 }
 
 /* Btree node freeing/allocation: */
@@ -1097,7 +1097,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
                 * Always check for space for two keys, even if we won't have to
                 * split at prior level - it might have been a merge instead:
                 */
-               if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+               if (bch2_btree_node_insert_fits(path->l[update_level].b,
                                                BKEY_BTREE_PTR_U64s_MAX * 2))
                        break;
 
@@ -1401,7 +1401,7 @@ static void __btree_split_node(struct btree_update *as,
 
                unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
                        nr_keys[i].val_u64s;
-               if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+               if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
                        n[i]->data->format = b->format;
 
                btree_node_set_format(n[i], n[i]->data->format);
@@ -1703,7 +1703,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
 
        bch2_btree_node_prep_for_write(trans, path, b);
 
-       if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+       if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
                bch2_btree_node_unlock_write(trans, path, b);
                goto split;
        }
index adfc62083844cf3b93d16d25d8269564f5b022a3..c593c925d1e3b03cfae5b4e7fdf0f7bc4b99df5c 100644 (file)
@@ -184,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
        b->sib_u64s[1] = b->nr.live_u64s;
 }
 
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
 {
-       return (void *) b->data + btree_bytes(c);
+       return (void *) b->data + btree_buf_bytes(b);
 }
 
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
-                                                           struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
 {
-       return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+       return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
 }
 
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
-                                                         struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
 {
-       return btree_data_end(c, b);
+       return btree_data_end(b);
 }
 
 static inline void *write_block(struct btree *b)
@@ -221,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
        return __btree_addr_written(b, k);
 }
 
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
-                                                struct btree *b,
-                                                void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
 {
        ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
                b->whiteout_u64s;
-       ssize_t total = c->opts.btree_node_size >> 3;
+       ssize_t total = btree_buf_bytes(b) >> 3;
 
        /* Always leave one extra u64 for bch2_varint_decode: */
        used++;
@@ -235,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
        return total - used;
 }
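A standalone sketch of the arithmetic above: space is accounted in u64s,
unwritten whiteouts consume from the end of the buffer, and one extra u64 is
always reserved for bch2_varint_decode (all numbers are illustrative):

#include <stdio.h>

int main(void)
{
        long buf_bytes = 256 << 10;     /* btree_buf_bytes(b) */
        long total = buf_bytes >> 3;    /* buffer size in u64s */
        long end_offset = 9000;         /* bytes used by keys so far */
        long whiteout_u64s = 12;

        long used = end_offset / 8 + whiteout_u64s;
        used++;                         /* spare u64 for bch2_varint_decode */
        printf("remaining u64s: %ld\n", total - used);
        return 0;
}
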
 
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
-                                                  struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
 {
-       ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+       ssize_t remaining = __bch2_btree_u64s_remaining(b,
                                btree_bkey_last(b, bset_tree_last(b)));
 
        BUG_ON(remaining < 0);
@@ -260,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
        return 8 << BTREE_WRITE_SET_U64s_BITS;
 }
 
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
-                                                    struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
 {
        struct bset_tree *t = bset_tree_last(b);
        struct btree_node_entry *bne = max(write_block(b),
                        (void *) btree_bkey_last(b, bset_tree_last(b)));
        ssize_t remaining_space =
-               __bch_btree_u64s_remaining(c, b, bne->keys.start);
+               __bch2_btree_u64s_remaining(b, bne->keys.start);
 
        if (unlikely(bset_written(b, bset(b, t)))) {
                if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -281,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
        return NULL;
 }
 
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
-                                struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
 {
        struct bkey_packed k;
 
-       BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+       BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
        EBUG_ON(btree_node_just_written(b));
 
        if (!bkey_pack_pos(&k, pos, b)) {
@@ -299,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
        k.needs_whiteout = true;
 
        b->whiteout_u64s += k.u64s;
-       bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+       bkey_p_copy(unwritten_whiteouts_start(b), &k);
 }
 
 /*
  * write lock must be held on @b (else the dirty bset that we were going to
  * insert into could be written out from under us)
  */
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
-                                              struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
 {
        if (unlikely(btree_node_need_rewrite(b)))
                return false;
 
-       return u64s <= bch_btree_keys_u64s_remaining(c, b);
+       return u64s <= bch2_btree_keys_u64s_remaining(b);
 }
 
 void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
index 5c1169c78dafec7bf238854a74b37120f1c835cd..ac7844861966368cdce41efd9e27c898fe8ad6e7 100644 (file)
@@ -125,13 +125,12 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
                               struct btree_write_buffered_key *wb,
                               bool *write_locked, size_t *fast)
 {
-       struct bch_fs *c = trans->c;
        struct btree_path *path;
        int ret;
 
        EBUG_ON(!wb->journal_seq);
-       EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
-       EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+       EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+       EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
 
        ret = bch2_btree_iter_traverse(iter);
        if (ret)
@@ -155,7 +154,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
                *write_locked = true;
        }
 
-       if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
+       if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
                *write_locked = false;
                return wb_flush_one_slowpath(trans, iter, wb);
        }
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 67b7e79648b15b4629b9c664eb35a84327be21c0..54f7826ac49874d46b08330678ea0b2565ecc491 100644 (file)
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -25,7 +25,7 @@
 
 #include <linux/preempt.h>
 
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
                                              enum bch_data_type data_type,
                                              s64 sectors)
 {
@@ -54,20 +54,20 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                bch2_fs_usage_acc_to_base(c, i);
 
        for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
-               usage->reserved += usage->persistent_reserved[i];
+               usage->b.reserved += usage->persistent_reserved[i];
 
        for (unsigned i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(&c->replicas, i);
 
-               fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+               fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
        }
 
        for_each_member_device(c, ca) {
                struct bch_dev_usage dev = bch2_dev_usage_read(ca);
 
-               usage->hidden += (dev.d[BCH_DATA_sb].buckets +
-                                 dev.d[BCH_DATA_journal].buckets) *
+               usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+                                   dev.d[BCH_DATA_journal].buckets) *
                        ca->mi.bucket_size;
        }
 
@@ -188,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
        prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
 
        prt_printf(out, "hidden:\t\t\t\t%llu\n",
-              fs_usage->u.hidden);
+              fs_usage->u.b.hidden);
        prt_printf(out, "data:\t\t\t\t%llu\n",
-              fs_usage->u.data);
+              fs_usage->u.b.data);
        prt_printf(out, "cached:\t\t\t\t%llu\n",
-              fs_usage->u.cached);
+              fs_usage->u.b.cached);
        prt_printf(out, "reserved:\t\t\t%llu\n",
-              fs_usage->u.reserved);
+              fs_usage->u.b.reserved);
        prt_printf(out, "nr_inodes:\t\t\t%llu\n",
-              fs_usage->u.nr_inodes);
+              fs_usage->u.b.nr_inodes);
        prt_printf(out, "online reserved:\t\t%llu\n",
               fs_usage->online_reserved);
 
@@ -225,10 +225,10 @@ static u64 reserve_factor(u64 r)
 
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
 {
-       return min(fs_usage->u.hidden +
-                  fs_usage->u.btree +
-                  fs_usage->u.data +
-                  reserve_factor(fs_usage->u.reserved +
+       return min(fs_usage->u.b.hidden +
+                  fs_usage->u.b.btree +
+                  fs_usage->u.b.data +
+                  reserve_factor(fs_usage->u.b.reserved +
                                  fs_usage->online_reserved),
                   c->capacity);
 }
@@ -240,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
        u64 data, reserved;
 
        ret.capacity = c->capacity -
-               bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+               bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
 
-       data            = bch2_fs_usage_read_one(c, &c->usage_base->data) +
-               bch2_fs_usage_read_one(c, &c->usage_base->btree);
-       reserved        = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+       data            = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+               bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+       reserved        = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
                percpu_u64_get(c->online_reserved);
 
        ret.used        = min(ret.capacity, data + reserve_factor(reserved));
        ret.free        = ret.capacity - ret.used;
 
-       ret.nr_inodes   = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+       ret.nr_inodes   = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
 
        return ret;
 }
@@ -284,7 +284,7 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
        prt_newline(out);
 
        for (unsigned i = 0; i < BCH_DATA_NR; i++) {
-               prt_str(out, bch2_data_types[i]);
+               bch2_prt_data_type(out, i);
                prt_tab(out);
                prt_u64(out, usage->d[i].buckets);
                prt_tab_rjust(out);
@@ -308,9 +308,9 @@ void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
        fs_usage = fs_usage_ptr(c, journal_seq, gc);
 
        if (data_type_is_hidden(old->data_type))
-               fs_usage->hidden -= ca->mi.bucket_size;
+               fs_usage->b.hidden -= ca->mi.bucket_size;
        if (data_type_is_hidden(new->data_type))
-               fs_usage->hidden += ca->mi.bucket_size;
+               fs_usage->b.hidden += ca->mi.bucket_size;
 
        u = dev_usage_ptr(ca, journal_seq, gc);
 
@@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
        if (idx < 0)
                return -1;
 
-       fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+       fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
        return 0;
 }
@@ -394,7 +394,7 @@ int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
 
        preempt_disable();
        fs_usage = fs_usage_ptr(c, journal_seq, gc);
-       fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+       fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
        preempt_enable();
 err:
@@ -523,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
        if (bch2_fs_inconsistent_on(g->data_type &&
                        g->data_type != data_type, c,
                        "different types of data in same bucket: %s, %s",
-                       bch2_data_types[g->data_type],
-                       bch2_data_types[data_type])) {
+                       bch2_data_type_str(g->data_type),
+                       bch2_data_type_str(data_type))) {
                ret = -EIO;
                goto err;
        }
@@ -532,7 +532,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
        if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
                        ca->dev_idx, b, g->gen,
-                       bch2_data_types[g->data_type ?: data_type],
+                       bch2_data_type_str(g->data_type ?: data_type),
                        g->dirty_sectors, sectors)) {
                ret = -EIO;
                goto err;
@@ -575,7 +575,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
-                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
                        ptr->gen,
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
                ret = -EIO;
@@ -588,7 +588,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
-                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
                        ptr->gen,
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -603,7 +603,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
                        *bucket_gen(ca, bucket_nr),
-                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
                        ptr->gen,
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -624,8 +624,8 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
                        "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
-                       bch2_data_types[bucket_data_type],
-                       bch2_data_types[ptr_data_type],
+                       bch2_data_type_str(bucket_data_type),
+                       bch2_data_type_str(ptr_data_type),
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, k), buf.buf));
                ret = -EIO;
@@ -638,7 +638,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
-                       bch2_data_types[bucket_data_type ?: ptr_data_type],
+                       bch2_data_type_str(bucket_data_type ?: ptr_data_type),
                        bucket_sectors, sectors,
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -677,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
                BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
        }
 
-       dst->nr_inodes -= deltas->nr_inodes;
+       dst->b.nr_inodes -= deltas->nr_inodes;
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                added                           -= deltas->persistent_reserved[i];
-               dst->reserved                   -= deltas->persistent_reserved[i];
+               dst->b.reserved                 -= deltas->persistent_reserved[i];
                dst->persistent_reserved[i]     -= deltas->persistent_reserved[i];
        }
 
@@ -694,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
        percpu_up_read(&c->mark_lock);
 }
 
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
-                             struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
+       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
        static int warned_disk_usage = 0;
        bool warn = false;
-       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-       struct replicas_delta *d, *d2;
-       struct replicas_delta *top = (void *) deltas->d + deltas->used;
-       struct bch_fs_usage *dst;
-       s64 added = 0, should_not_have_added;
-       unsigned i;
 
        percpu_down_read(&c->mark_lock);
        preempt_disable();
-       dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+       struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+       struct bch_fs_usage_base *src = &trans->fs_usage_delta;
 
-       for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
-               switch (d->r.data_type) {
-               case BCH_DATA_btree:
-               case BCH_DATA_user:
-               case BCH_DATA_parity:
-                       added += d->delta;
-               }
-
-               if (__update_replicas(c, dst, &d->r, d->delta))
-                       goto need_mark;
-       }
-
-       dst->nr_inodes += deltas->nr_inodes;
-
-       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-               added                           += deltas->persistent_reserved[i];
-               dst->reserved                   += deltas->persistent_reserved[i];
-               dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
-       }
+       s64 added = src->btree + src->data + src->reserved;
 
        /*
         * Not allowed to reduce sectors_available except by getting a
         * reservation:
         */
-       should_not_have_added = added - (s64) disk_res_sectors;
+       s64 should_not_have_added = added - (s64) disk_res_sectors;
        if (unlikely(should_not_have_added > 0)) {
                u64 old, new, v = atomic64_read(&c->sectors_available);
 
@@ -754,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
                this_cpu_sub(*c->online_reserved, added);
        }
 
+       dst->hidden     += src->hidden;
+       dst->btree      += src->btree;
+       dst->data       += src->data;
+       dst->cached     += src->cached;
+       dst->reserved   += src->reserved;
+       dst->nr_inodes  += src->nr_inodes;
+
        preempt_enable();
        percpu_up_read(&c->mark_lock);
 
@@ -761,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
                bch2_trans_inconsistent(trans,
                                        "disk usage increased %lli more than %llu sectors reserved)",
                                        should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+                             struct replicas_delta_list *deltas)
+{
+       struct bch_fs *c = trans->c;
+       struct replicas_delta *d, *d2;
+       struct replicas_delta *top = (void *) deltas->d + deltas->used;
+       struct bch_fs_usage *dst;
+       unsigned i;
+
+       percpu_down_read(&c->mark_lock);
+       preempt_disable();
+       dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+       for (d = deltas->d; d != top; d = replicas_delta_next(d))
+               if (__update_replicas(c, dst, &d->r, d->delta))
+                       goto need_mark;
+
+       dst->b.nr_inodes += deltas->nr_inodes;
+
+       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+               dst->b.reserved                 += deltas->persistent_reserved[i];
+               dst->persistent_reserved[i]     += deltas->persistent_reserved[i];
+       }
+
+       preempt_enable();
+       percpu_up_read(&c->mark_lock);
        return 0;
 need_mark:
        /* revert changes: */
@@ -1023,6 +1035,18 @@ int bch2_trigger_extent(struct btree_trans *trans,
                        struct bkey_s_c old, struct bkey_s new,
                        unsigned flags)
 {
+       struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
+       struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
+       unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
+       unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+
+       /* if pointers aren't changing - nothing to do: */
+       if (new_ptrs_bytes == old_ptrs_bytes &&
+           !memcmp(new_ptrs.start,
+                   old_ptrs.start,
+                   new_ptrs_bytes))
+               return 0;
+
        if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
                struct bch_fs *c = trans->c;
                int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
@@ -1072,7 +1096,7 @@ static int __trigger_reservation(struct btree_trans *trans,
                struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
 
                replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
-               fs_usage->reserved                              += sectors;
+               fs_usage->b.reserved                            += sectors;
                fs_usage->persistent_reserved[replicas - 1]     += sectors;
 
                preempt_enable();
@@ -1118,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
                        iter.pos.inode, iter.pos.offset, a->v.gen,
-                       bch2_data_types[a->v.data_type],
-                       bch2_data_types[type],
-                       bch2_data_types[type]);
+                       bch2_data_type_str(a->v.data_type),
+                       bch2_data_type_str(type),
+                       bch2_data_type_str(type));
                ret = -EIO;
                goto err;
        }
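
The buckets.c changes above split the summary counters out into the new struct bch_fs_usage_base (the ->b member; see the buckets_types.h hunk below), letting the same struct serve both as the persistent totals and as the per-transaction delta (trans->fs_usage_delta) that the new bch2_trans_account_disk_usage_change() folds in at commit time. The fold itself, reduced to a self-contained sketch with stand-in types:

#include <stdio.h>
#include <stdint.h>

struct usage_base {
        uint64_t hidden, btree, data, cached, reserved, nr_inodes;
};

/* Mirrors the dst->X += src->X block added above; locking and the
 * sectors_available sanity check are elided. */
static void account_usage_change(struct usage_base *dst,
                                 const struct usage_base *src)
{
        dst->hidden     += src->hidden;
        dst->btree      += src->btree;
        dst->data       += src->data;
        dst->cached     += src->cached;
        dst->reserved   += src->reserved;
        dst->nr_inodes  += src->nr_inodes;
}

int main(void)
{
        struct usage_base fs    = { .data = 1000 };
        struct usage_base delta = { .data = 24, .nr_inodes = 1 };

        account_usage_change(&fs, &delta);
        printf("data=%llu nr_inodes=%llu\n",
               (unsigned long long) fs.data,
               (unsigned long long) fs.nr_inodes);      /* 1024, 1 */
        return 0;
}
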
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index 2c95cc5d86be661c6d6a0783d366d5d8b8b919d7..6387e039f7897534e27c207dd3818dc4b6afb3b7 100644 (file)
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -356,6 +356,8 @@ int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
        ret;                                                                                    \
 })
 
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
+
 void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
 int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
@@ -385,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
        return false;
 }
 
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+       return type < BCH_DATA_NR
+               ? __bch2_data_types[type]
+               : "(invalid data type)";
+}
+
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+       if (type < BCH_DATA_NR)
+               prt_str(out, __bch2_data_types[type]);
+       else
+               prt_printf(out, "(invalid data type %u)", type);
+}
+
 /* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
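
bch2_data_type_str() and bch2_prt_data_type() replace the bare bch2_data_types[] array indexing used throughout the buckets.c hunks above, so an out-of-range value read from a corrupt key prints a diagnostic instead of indexing past the end of the array. The guard pattern in self-contained form (toy enum, not the real BCH_DATA_* list):

#include <stdio.h>

enum data_type { DT_FREE, DT_SB, DT_JOURNAL, DT_USER, DT_NR };

static const char * const data_type_strs[] = {
        "free", "sb", "journal", "user",
};

/* Bounds check before indexing: the value may come from disk. */
static const char *data_type_str(unsigned type)
{
        return type < DT_NR ? data_type_strs[type] : "(invalid data type)";
}

int main(void)
{
        printf("%s\n", data_type_str(DT_USER)); /* user */
        printf("%s\n", data_type_str(200));     /* (invalid data type) */
        return 0;
}
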
diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h
index 783f71017204cafa0277644a6d1b5564c779d366..6a31740222a7132e3f0735675ba63ed3402f00a8 100644 (file)
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -45,23 +45,18 @@ struct bch_dev_usage {
        }                       d[BCH_DATA_NR];
 };
 
-struct bch_fs_usage {
-       /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
        u64                     hidden;
        u64                     btree;
        u64                     data;
        u64                     cached;
        u64                     reserved;
        u64                     nr_inodes;
+};
 
-       /* XXX: add stats for compression ratio */
-#if 0
-       u64                     uncompressed;
-       u64                     compressed;
-#endif
-
-       /* broken out: */
-
+struct bch_fs_usage {
+       /* all fields are in units of 512 byte sectors: */
+       struct bch_fs_usage_base b;
        u64                     persistent_reserved[BCH_REPLICAS_MAX];
        u64                     replicas[];
 };
diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c
index f41889093a2c7eacaa1723667fc7bb2af5d0f3aa..3636444511064b51e5a004b953eacf94e7c70d12 100644 (file)
--- a/libbcachefs/clock.c
+++ b/libbcachefs/clock.c
@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
        if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
                mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
 
-       while (1) {
+       do {
                set_current_state(TASK_INTERRUPTIBLE);
                if (kthread && kthread_should_stop())
                        break;
@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
 
                schedule();
                try_to_freeze();
-       }
+       } while (0);
 
        __set_current_state(TASK_RUNNING);
        del_timer_sync(&wait.cpu_timer);
diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h
index 607fd5e232c902dbb39f3dac84ea2e214e6b106c..58c2eb45570ff022764720f9beb10ecfa2926367 100644 (file)
--- a/libbcachefs/compress.h
+++ b/libbcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
        return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
 }
 
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+       if (type < BCH_COMPRESSION_TYPE_NR)
+               prt_str(out, __bch2_compression_types[type]);
+       else
+               prt_printf(out, "(invalid compression type %u)", type);
+}
+
 int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
                                struct bch_extent_crc_unpacked *);
 int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 6f13477ff652e9e0552b9fbbb49009a5651d6d76..4150feca42a2e65e63a59234a3e806ebbd09e1ac 100644 (file)
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -285,9 +285,7 @@ restart_drop_extra_replicas:
                                                k.k->p, bkey_start_pos(&insert->k)) ?:
                        bch2_insert_snapshot_whiteouts(trans, m->btree_id,
                                                k.k->p, insert->k.p) ?:
-                       bch2_bkey_set_needs_rebalance(c, insert,
-                                                     op->opts.background_target,
-                                                     op->opts.background_compression) ?:
+                       bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
                        bch2_trans_update(trans, &iter, insert,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
@@ -529,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
                BCH_WRITE_DATA_ENCODED|
                BCH_WRITE_MOVE|
                m->data_opts.write_flags;
-       m->op.compression_opt   = io_opts.background_compression ?: io_opts.compression;
+       m->op.compression_opt   = background_compression(io_opts);
        m->op.watermark         = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
 
        bkey_for_each_ptr(ptrs, ptr)
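
The open-coded fallback io_opts.background_compression ?: io_opts.compression becomes a call to background_compression(). The helper itself is not in this section, but is presumably a one-liner along these lines (a sketch of the presumed accessor, not its verbatim definition):

/* Presumed helper: use the background compression option if set,
 * otherwise fall back to the foreground compression option. */
static inline unsigned background_compression(struct bch_io_opts opts)
{
        return opts.background_compression ?: opts.compression;
}
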
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index d6418948495f8392898178dd9b350b1829a24aae..7bdba8507fc93cdfdecc29de3e70e5589cf8177b 100644 (file)
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
                return false;
 
        bio = bio_alloc_bioset(ca->disk_sb.bdev,
-                              buf_pages(n_sorted, btree_bytes(c)),
+                              buf_pages(n_sorted, btree_buf_bytes(b)),
                               REQ_OP_READ|REQ_META,
                               GFP_NOFS,
                               &c->btree_bio);
        bio->bi_iter.bi_sector  = pick.ptr.offset;
-       bch2_bio_map(bio, n_sorted, btree_bytes(c));
+       bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
 
        submit_bio_wait(bio);
 
        bio_put(bio);
        percpu_ref_put(&ca->io_ref);
 
-       memcpy(n_ondisk, n_sorted, btree_bytes(c));
+       memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
 
        v->written = 0;
        if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        mutex_lock(&c->verify_lock);
 
        if (!c->verify_ondisk) {
-               c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+               c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
                if (!c->verify_ondisk)
                        goto out;
        }
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
                return;
        }
 
-       n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+       n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
        if (!n_ondisk) {
                prt_printf(out, "memory allocation failure\n");
                goto out;
        }
 
        bio = bio_alloc_bioset(ca->disk_sb.bdev,
-                              buf_pages(n_ondisk, btree_bytes(c)),
+                              buf_pages(n_ondisk, btree_buf_bytes(b)),
                               REQ_OP_READ|REQ_META,
                               GFP_NOFS,
                               &c->btree_bio);
        bio->bi_iter.bi_sector  = pick.ptr.offset;
-       bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+       bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
 
        ret = submit_bio_wait(bio);
        if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 out:
        if (bio)
                bio_put(bio);
-       kvpfree(n_ondisk, btree_bytes(c));
+       kvpfree(n_ondisk, btree_buf_bytes(b));
        percpu_ref_put(&ca->io_ref);
 }
 
@@ -627,7 +627,7 @@ restart:
                prt_printf(&i->buf, "backtrace:");
                prt_newline(&i->buf);
                printbuf_indent_add(&i->buf, 2);
-               bch2_prt_task_backtrace(&i->buf, task, 0);
+               bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
                printbuf_indent_sub(&i->buf, 2);
                prt_newline(&i->buf);
 
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index 4ae1e9f002a09b9c7ea3bed1709334f35373b061..ae29ad0c63e57466dc4cb5eb75728cb589f43eec 100644 (file)
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -144,19 +144,21 @@ fsck_err:
        return ret;
 }
 
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
-                        struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
        struct qstr d_name = bch2_dirent_get_name(d);
 
-       prt_printf(out, "%.*s -> %llu type %s",
-              d_name.len,
-              d_name.name,
-              d.v->d_type != DT_SUBVOL
-              ? le64_to_cpu(d.v->d_inum)
-              : le32_to_cpu(d.v->d_child_subvol),
-              bch2_d_type_str(d.v->d_type));
+       prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+       if (d.v->d_type != DT_SUBVOL)
+               prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+       else
+               prt_printf(out, "%u -> %u",
+                          le32_to_cpu(d.v->d_parent_subvol),
+                          le32_to_cpu(d.v->d_child_subvol));
+
+       prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
diff --git a/libbcachefs/dirent_format.h b/libbcachefs/dirent_format.h
new file mode 100644 (file)
index 0000000..5e116b8
--- /dev/null
+++ b/libbcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+       struct bch_val          v;
+
+       /* Target inode number: */
+       union {
+       __le64                  d_inum;
+       struct {                /* DT_SUBVOL */
+       __le32                  d_child_subvol;
+       __le32                  d_parent_subvol;
+       };
+       };
+
+       /*
+        * Copy of mode bits 12-15 from the target inode - so userspace can get
+        * the filetype without having to do a stat()
+        */
+       __u8                    d_type;
+
+       __u8                    d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL      16
+#define BCH_DT_MAX     17
+
+#define BCH_NAME_MAX   512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
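
The comment at the top of the new header describes the dirent index: keys are 64-bit hashes of the name, collisions are resolved by linear probing, and deletions must leave whiteouts so probe chains stay intact. A toy open-addressed table modelling that lookup discipline (the real index is of course the btree):

#include <stdint.h>
#include <string.h>

#define TABLE_SIZE 64

enum slot_state { EMPTY, USED, WHITEOUT };

struct slot {
        enum slot_state state;
        uint64_t        hash;
        char            name[32];
};

/* Keep probing past whiteouts and colliding entries; only a truly
 * empty slot terminates an unsuccessful search. This is why delete
 * must write a whiteout rather than simply clearing the slot. */
static struct slot *dirent_lookup(struct slot *table, uint64_t hash,
                                  const char *name)
{
        for (unsigned i = 0; i < TABLE_SIZE; i++) {
                struct slot *s = &table[(hash + i) % TABLE_SIZE];

                if (s->state == EMPTY)
                        return NULL;
                if (s->state == USED &&
                    s->hash == hash &&
                    !strcmp(s->name, name))
                        return s;
        }
        return NULL;
}

int main(void)
{
        static struct slot table[TABLE_SIZE];

        table[7] = (struct slot) { USED, 7, "hello" };
        return !dirent_lookup(table, 7, "hello");       /* found: exit 0 */
}
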
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index d802bc63c8d0b4832bd8062ce827c8af180361e6..d503af2700247d8aa1257962c37df9b042ee55ec 100644 (file)
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -190,7 +190,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
                                               a->v.stripe_redundancy, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
                                iter.pos.inode, iter.pos.offset, a->v.gen,
-                               bch2_data_types[a->v.data_type],
+                               bch2_data_type_str(a->v.data_type),
                                a->v.dirty_sectors,
                                a->v.stripe, s.k->p.offset)) {
                        ret = -EIO;
@@ -200,7 +200,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
                if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
                                "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
                                iter.pos.inode, iter.pos.offset, a->v.gen,
-                               bch2_data_types[a->v.data_type],
+                               bch2_data_type_str(a->v.data_type),
                                a->v.dirty_sectors,
                                s.k->p.offset)) {
                        ret = -EIO;
@@ -367,7 +367,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
                }
        }
 
-       if (!(flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))) {
+       if (flags & BTREE_TRIGGER_ATOMIC) {
                struct stripe *m = genradix_ptr(&c->stripes, idx);
 
                if (!m) {
diff --git a/libbcachefs/ec_format.h b/libbcachefs/ec_format.h
new file mode 100644 (file)
index 0000000..44ce88b
--- /dev/null
+++ b/libbcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+       struct bch_val          v;
+       __le16                  sectors;
+       __u8                    algorithm;
+       __u8                    nr_blocks;
+       __u8                    nr_redundant;
+
+       __u8                    csum_granularity_bits;
+       __u8                    csum_type;
+       __u8                    pad;
+
+       struct bch_extent_ptr   ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
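
bch_stripe describes one erasure-coded stripe: nr_blocks extent pointers of which nr_redundant are parity, each block sectors long, checksummed in chunks of 2^csum_granularity_bits sectors. A small worked computation of the per-block checksum count those fields imply (illustrative values; my reading of the format, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned sectors = 128;                 /* size of each stripe block */
        unsigned csum_granularity_bits = 4;     /* one checksum per 16 sectors */

        unsigned csums_per_block =
                (sectors + (1U << csum_granularity_bits) - 1) >>
                csum_granularity_bits;          /* round up */

        printf("%u checksums per block\n", csums_per_block);    /* 8 */
        return 0;
}
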
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 82ec056f4cdbb1f4e4234fce274939b61b7a5015..61395b113df9bdad67c0da7d2a4cc4f99664bc4e 100644 (file)
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -8,6 +8,7 @@
 
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "btree_cache.h"
 #include "btree_gc.h"
 #include "btree_io.h"
 #include "btree_iter.h"
@@ -1018,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                        struct bch_extent_crc_unpacked crc =
                                bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+                       prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
                               crc.compressed_size,
                               crc.uncompressed_size,
                               crc.offset, crc.nonce,
-                              bch2_csum_types[crc.csum_type],
-                              bch2_compression_types[crc.compression_type]);
+                              bch2_csum_types[crc.csum_type]);
+                       bch2_prt_compression_type(out, crc.compression_type);
                        break;
                }
                case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1334,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
 }
 
 int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
-                                 unsigned target, unsigned compression)
+                                 struct bch_io_opts *opts)
 {
        struct bkey_s k = bkey_i_to_s(_k);
        struct bch_extent_rebalance *r;
+       unsigned target = opts->background_target;
+       unsigned compression = background_compression(*opts);
        bool needs_rebalance;
 
        if (!bkey_extent_is_direct_data(k.k))
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index a855c94d43ddb4f770f69807401f6d9dd5f66cbf..6bf839d69e84e6e24ed3bf2bf611177fc04676e1 100644 (file)
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -708,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
 bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
 
 int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
-                                 unsigned, unsigned);
+                                 struct bch_io_opts *);
 
 /* Generic extent code: */
 
diff --git a/libbcachefs/extents_format.h b/libbcachefs/extents_format.h
new file mode 100644 (file)
index 0000000..3bd2fdb
--- /dev/null
+++ b/libbcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32    - 0b1
+ * bch_extent_ptr      - 0b10
+ * bch_extent_crc64    - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES()               \
+       x(ptr,                  0)              \
+       x(crc32,                1)              \
+       x(crc64,                2)              \
+       x(crc128,               3)              \
+       x(stripe_ptr,           4)              \
+       x(rebalance,            5)
+#define BCH_EXTENT_ENTRY_MAX   6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+       BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u32                   type:2,
+                               _compressed_size:7,
+                               _uncompressed_size:7,
+                               offset:7,
+                               _unused:1,
+                               csum_type:4,
+                               compression_type:4;
+       __u32                   csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u32                   csum;
+       __u32                   compression_type:4,
+                               csum_type:4,
+                               _unused:1,
+                               offset:7,
+                               _uncompressed_size:7,
+                               _compressed_size:7,
+                               type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX         (1U << 7)
+#define CRC32_NONCE_MAX                0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:3,
+                               _compressed_size:9,
+                               _uncompressed_size:9,
+                               offset:9,
+                               nonce:10,
+                               csum_type:4,
+                               compression_type:4,
+                               csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   csum_hi:16,
+                               compression_type:4,
+                               csum_type:4,
+                               nonce:10,
+                               offset:9,
+                               _uncompressed_size:9,
+                               _compressed_size:9,
+                               type:3;
+#endif
+       __u64                   csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX         (1U << 9)
+#define CRC64_NONCE_MAX                ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:4,
+                               _compressed_size:13,
+                               _uncompressed_size:13,
+                               offset:13,
+                               nonce:13,
+                               csum_type:4,
+                               compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   compression_type:4,
+                               csum_type:4,
+                               nonce:13,
+                               offset:13,
+                               _uncompressed_size:13,
+                               _compressed_size:13,
+                               type:4;
+#endif
+       struct bch_csum         csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX                (1U << 13)
+#define CRC128_NONCE_MAX       ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:1,
+                               cached:1,
+                               unused:1,
+                               unwritten:1,
+                               offset:44, /* 8 petabytes */
+                               dev:8,
+                               gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   gen:8,
+                               dev:8,
+                               offset:44,
+                               unwritten:1,
+                               unused:1,
+                               cached:1,
+                               type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:5,
+                               block:8,
+                               redundancy:4,
+                               idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   idx:47,
+                               redundancy:4,
+                               block:8,
+                               type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:6,
+                               unused:34,
+                               compression:8, /* enum bch_compression_opt */
+                               target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   target:16,
+                               compression:8,
+                               unused:34,
+                               type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
+       unsigned long                   type;
+#elif __BITS_PER_LONG == 32
+       struct {
+               unsigned long           pad;
+               unsigned long           type;
+       };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f  f;
+       BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+       struct bch_val          v;
+
+       __u64                   _data[0];
+       struct bch_extent_ptr   start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+       struct bch_val          v;
+
+       __u64                   mem_ptr;
+       __le64                  seq;
+       __le16                  sectors_written;
+       __le16                  flags;
+       struct bpos             min_key;
+       __u64                   _data[0];
+       struct bch_extent_ptr   start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+       struct bch_val          v;
+
+       __u64                   _data[0];
+       union bch_extent_entry  start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+       ((sizeof(struct bch_extent_crc128) +                    \
+         sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX                               \
+       (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX                            \
+       ((sizeof(struct bch_btree_ptr_v2) +                     \
+         sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX                                        \
+       (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+       struct bch_val          v;
+
+       __le32                  generation;
+       __u8                    nr_replicas;
+       __u8                    pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+       struct bch_val          v;
+       u8                      data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
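
The header's opening comment explains the utf8-like entry encoding: the position of the lowest set bit in an entry's leading bits names its type, so the most size-constrained entries get the shortest type fields. A standalone sketch of that decode step (an illustration of the scheme, not the kernel's actual helper):

#include <stdint.h>
#include <stdio.h>
#include <strings.h>

/* Entry type = position of the lowest set bit of the first word. */
static unsigned entry_type(uint64_t first_word)
{
        return ffs((int) (first_word & 0xff)) - 1;
}

int main(void)
{
        printf("%u\n", entry_type(0x1));        /* 0 */
        printf("%u\n", entry_type(0x2));        /* 1 */
        printf("%u\n", entry_type(0x4));        /* 2 */
        return 0;
}
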
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 05429c9631cdad6eced17ff7638cd61651e12bf5..b04750dbf870bc78c95ece35d363e3a4c0936b50 100644 (file)
--- a/libbcachefs/eytzinger.h
+++ b/libbcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
 }
 
 #define eytzinger1_for_each(_i, _size)                 \
-       for ((_i) = eytzinger1_first((_size));          \
+       for (unsigned (_i) = eytzinger1_first((_size)); \
             (_i) != 0;                                 \
             (_i) = eytzinger1_next((_i), (_size)))
 
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
 }
 
 #define eytzinger0_for_each(_i, _size)                 \
-       for ((_i) = eytzinger0_first((_size));          \
+       for (unsigned (_i) = eytzinger0_first((_size)); \
             (_i) != -1;                                \
             (_i) = eytzinger0_next((_i), (_size)))
 
@@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 
 #define eytzinger0_find(base, nr, size, _cmp, search)                  \
 ({                                                                     \
-       void *_base     = (base);                                       \
-       void *_search   = (search);                                     \
-       size_t _nr      = (nr);                                         \
-       size_t _size    = (size);                                       \
-       size_t _i       = 0;                                            \
+       void *_base             = (base);                               \
+       const void *_search     = (search);                             \
+       size_t _nr              = (nr);                                 \
+       size_t _size            = (size);                               \
+       size_t _i               = 0;                                    \
        int _res;                                                       \
                                                                        \
        while (_i < _nr &&                                              \
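
The eytzinger changes above make the iteration macros declare their own loop variable and constify the search key. For context, an eytzinger array stores sorted data in BFS order of a complete binary tree (children of 1-indexed node i are 2i and 2i+1), which keeps searches cache-friendly. The classic branchless lower-bound over such an array, as a standalone illustration (the textbook trick, not bcachefs's eytzinger0_find):

#include <stdio.h>

static unsigned eytzinger1_search(const int *a, unsigned n, int key)
{
        unsigned i = 1;

        /* Each step appends one bit: 0 for a left turn, 1 for right. */
        while (i <= n)
                i = 2 * i + (a[i] < key);

        /* Strip trailing right turns (and one left) to recover the
         * last node where we went left: the first element >= key,
         * or 0 if there is none. */
        return i >> __builtin_ffs(~i);
}

int main(void)
{
        /* sorted {1,2,3} laid out in eytzinger order, 1-indexed: */
        const int a[] = { 0, 2, 1, 3 };

        printf("%u\n", eytzinger1_search(a, 3, 2));     /* 1: a[1] == 2 */
        printf("%u\n", eytzinger1_search(a, 3, 4));     /* 0: none >= 4 */
        return 0;
}
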
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
index fdd57c5785c9cebf609959fb753ee30e55e85b92..e3b219e19e1008ccfe1ff61e966115795f9c1831 100644 (file)
--- a/libbcachefs/fs-io-direct.c
+++ b/libbcachefs/fs-io-direct.c
@@ -77,6 +77,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
 
        bch2_inode_opts_get(&opts, c, &inode->ei_inode);
 
+       /* bios must be 512 byte aligned: */
+       if ((offset|iter->count) & (SECTOR_SIZE - 1))
+               return -EINVAL;
+
        ret = min_t(loff_t, iter->count,
                    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
 
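
The new alignment check folds two tests into one: OR-ing offset and count before masking with SECTOR_SIZE - 1 catches a low bit set in either value. Standalone:

#include <stdio.h>

#define SECTOR_SIZE 512

/* Both values are sector aligned iff their OR has no low bits set. */
static int is_sector_aligned(unsigned long long offset,
                             unsigned long long count)
{
        return !((offset | count) & (SECTOR_SIZE - 1));
}

int main(void)
{
        printf("%d\n", is_sector_aligned(4096, 1024));  /* 1 */
        printf("%d\n", is_sector_aligned(4096, 1000));  /* 0 */
        return 0;
}
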
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
index ff664fd0d8ef80e8b4816d7c430e87d41759b498..d359aa9b33b828342bd466b899713f401d939b30 100644 (file)
--- a/libbcachefs/fs-io-pagecache.c
+++ b/libbcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
        }
 }
 
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
-                                 u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+                                u64 *start, u64 end,
+                                bool nonblocking)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
        pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
        struct folio_batch fbatch;
        s64 i_sectors_delta = 0;
-       unsigned i, j;
+       int ret = 0;
 
-       if (end <= start)
-               return;
+       if (end <= *start)
+               return 0;
 
        folio_batch_init(&fbatch);
 
        while (filemap_get_folios(inode->v.i_mapping,
                                  &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+               for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
+
+                       if (!nonblocking)
+                               folio_lock(folio);
+                       else if (!folio_trylock(folio)) {
+                               folio_batch_release(&fbatch);
+                               ret = -EAGAIN;
+                               break;
+                       }
+
                        u64 folio_start = folio_sector(folio);
                        u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
 
                        BUG_ON(end <= folio_start);
 
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
+                       *start = min(end, folio_end);
 
+                       struct bch_folio *s = bch2_folio(folio);
                        if (s) {
+                               unsigned folio_offset = max(*start, folio_start) - folio_start;
+                               unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
                                spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++) {
+                               for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
                                        i_sectors_delta -= s->s[j].state == SECTOR_dirty;
                                        bch2_folio_sector_set(folio, s, j,
                                                folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
        }
 
        bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       return ret;
 }
 
 static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
index 27f712ae37a68209275cc3b2955a542314e80e68..8cbaba6565b4493695d679fe41553c197468c752 100644 (file)
--- a/libbcachefs/fs-io-pagecache.h
+++ b/libbcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
 void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
 
 void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
 
 int bch2_get_folio_disk_reservation(struct bch_fs *,
                                struct bch_inode_info *,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 98bd5babab193bec842dce20b0783e6c958ac5bf..8c70123b6a0c809b6d50040593281c2e9c115828 100644 (file)
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
                        continue;
 
                bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
-                                                   REQ_OP_FLUSH,
+                                                   REQ_OP_WRITE|REQ_PREFLUSH,
                                                    GFP_KERNEL,
                                                    &c->nocow_flush_bioset),
                                   struct nocow_flush, bio);
@@ -675,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 
                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
-               drop_locks_do(trans,
-                       (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+               if (bch2_mark_pagecache_reserved(inode, &hole_start,
+                                                iter.pos.offset, true))
+                       drop_locks_do(trans,
+                               bch2_mark_pagecache_reserved(inode, &hole_start,
+                                                            iter.pos.offset, false));
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
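
__bchfs_fallocate() now calls bch2_mark_pagecache_reserved() in nonblocking mode first, while btree locks are still held; only when a folio_trylock() fails (the -EAGAIN path added in the fs-io-pagecache.c hunks above) does it drop transaction locks and repeat the remainder in blocking mode. The shape of that trylock-first pattern, as a standalone sketch with stand-in functions:

#include <errno.h>
#include <stdio.h>

static int contended = 1;

static int do_work(int nonblocking)
{
        if (contended && nonblocking)
                return -EAGAIN;         /* would have had to sleep */
        return 0;
}

static void drop_locks(void)   { puts("drop trans locks"); }
static void retake_locks(void) { puts("retake trans locks"); }

int main(void)
{
        /* Fast path: no sleeping while locks are held. */
        int ret = do_work(1);

        if (ret == -EAGAIN) {
                /* Slow path: drop locks, redo blocking, relock. */
                drop_locks();
                ret = do_work(0);
                retake_locks();
        }
        return ret;
}
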
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index e0a19a73c8e1a6dae41b0f9d0f7f96226e07260e..3a4c24c28e7fa06deff38f6bb0b240a5daacda8c 100644 (file)
--- a/libbcachefs/fs-ioctl.c
+++ b/libbcachefs/fs-ioctl.c
@@ -287,12 +287,12 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
 
        switch (flags) {
        case FSOP_GOING_FLAGS_DEFAULT:
-               ret = freeze_bdev(c->vfs_sb->s_bdev);
+               ret = bdev_freeze(c->vfs_sb->s_bdev);
                if (ret)
                        break;
                bch2_journal_flush(&c->journal);
                bch2_fs_emergency_read_only(c);
-               thaw_bdev(c->vfs_sb->s_bdev);
+               bdev_thaw(c->vfs_sb->s_bdev);
                break;
        case FSOP_GOING_FLAGS_LOGFLUSH:
                bch2_journal_flush(&c->journal);
@@ -337,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
        if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
                create_flags |= BCH_CREATE_SNAPSHOT_RO;
 
-       /* why do we need this lock? */
-       down_read(&c->vfs_sb->s_umount);
-
-       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+               /* sync_inodes_sb() requires that s_umount be held */
+               down_read(&c->vfs_sb->s_umount);
                sync_inodes_sb(c->vfs_sb);
+               up_read(&c->vfs_sb->s_umount);
+       }
 retry:
        if (arg.src_ptr) {
                error = user_path_at(arg.dirfd,
@@ -425,8 +426,6 @@ err2:
                goto retry;
        }
 err1:
-       up_read(&c->vfs_sb->s_umount);
-
        return error;
 }
 
@@ -443,33 +442,36 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
                                struct bch_ioctl_subvolume arg)
 {
+       const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
        struct path path;
        struct inode *dir;
+       struct dentry *victim;
        int ret = 0;
 
        if (arg.flags)
                return -EINVAL;
 
-       ret = user_path_at(arg.dirfd,
-                       (const char __user *)(unsigned long)arg.dst_ptr,
-                       LOOKUP_FOLLOW, &path);
-       if (ret)
-               return ret;
+       victim = user_path_locked_at(arg.dirfd, name, &path);
+       if (IS_ERR(victim))
+               return PTR_ERR(victim);
 
-       if (path.dentry->d_sb->s_fs_info != c) {
+       if (victim->d_sb->s_fs_info != c) {
                ret = -EXDEV;
                goto err;
        }
-
-       dir = path.dentry->d_parent->d_inode;
-
-       ret = __bch2_unlink(dir, path.dentry, true);
-       if (ret)
+       if (!d_is_positive(victim)) {
+               ret = -ENOENT;
                goto err;
-
-       fsnotify_rmdir(dir, path.dentry);
-       d_delete(path.dentry);
+       }
+       dir = d_inode(path.dentry);
+       ret = __bch2_unlink(dir, victim, true);
+       if (!ret) {
+               fsnotify_rmdir(dir, victim);
+               d_delete(victim);
+       }
+       inode_unlock(dir);
 err:
+       dput(victim);
        path_put(&path);
        return ret;
 }
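
bch2_ioctl_subvolume_destroy() previously looked the path up with user_path_at() and only later touched d_parent; the rewrite uses user_path_locked_at(), which hands back the victim dentry with the parent directory already locked, so lookup and __bch2_unlink() happen in one critical section and nothing can rename or remove the entry in between. A toy model of why that matters (lookup and removal under a single lock hold; the names here are invented):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct dir {
        pthread_mutex_t lock;
        char            entries[4][16];
};

/* Find and remove in one critical section: no window between the
 * lookup and the unlink where the entry can change underneath us. */
static int remove_entry(struct dir *d, const char *name)
{
        int ret = -1;

        pthread_mutex_lock(&d->lock);           /* like locking the dir inode */
        for (unsigned i = 0; i < 4; i++)
                if (!strcmp(d->entries[i], name)) {
                        d->entries[i][0] = '\0';
                        ret = 0;
                        break;
                }
        pthread_mutex_unlock(&d->lock);
        return ret;
}

int main(void)
{
        struct dir d = { PTHREAD_MUTEX_INITIALIZER, { "a", "snap", "c" } };

        printf("%d\n", remove_entry(&d, "snap"));       /* 0 */
        printf("%d\n", remove_entry(&d, "snap"));       /* -1: gone */
        return 0;
}
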
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index da11757682c0967950e9e9ea7d1e4d862937d0ce..ec419b8e2c43123b42e0d84c837611fc5f6e2314 100644 (file)
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -1129,7 +1129,7 @@ static const struct address_space_operations bch_address_space_operations = {
 #ifdef CONFIG_MIGRATION
        .migrate_folio  = filemap_migrate_folio,
 #endif
-       .error_remove_page = generic_error_remove_page,
+       .error_remove_folio = generic_error_remove_folio,
 };
 
 struct bcachefs_fid {
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 4f0ecd60567570b7364cef517225ea0e3dfa5575..7e82c7bc0ca82289b36e4f32be5e0ce343874748 100644 (file)
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -119,7 +119,6 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
        if (!ret)
                *snapshot = iter.pos.snapshot;
 err:
-       bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -250,7 +249,9 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
         * The bch2_check_dirents pass has already run, dangling dirents
         * shouldn't exist here:
         */
-       return lookup_inode(trans, inum, lostfound, &snapshot);
+       ret = lookup_inode(trans, inum, lostfound, &snapshot);
+       bch_err_msg(c, ret, "looking up lost+found");
+       return ret;
 
 create_lostfound:
        /*
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 37dce96f48ac42d28b98d99e75a77b049e04de8f..086f0090b03a4015388dce49388ba5951940cb0a 100644 (file)
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
 static void __bch2_inode_unpacked_to_text(struct printbuf *out,
                                          struct bch_inode_unpacked *inode)
 {
-       prt_printf(out, "mode=%o ", inode->bi_mode);
+       printbuf_indent_add(out, 2);
+       prt_printf(out, "mode=%o", inode->bi_mode);
+       prt_newline(out);
 
        prt_str(out, "flags=");
        prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
        prt_printf(out, " (%x)", inode->bi_flags);
+       prt_newline(out);
 
-       prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
-              inode->bi_journal_seq,
-              inode->bi_size,
-              inode->bi_sectors,
-              inode->bi_version);
+       prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+       prt_newline(out);
+
+       prt_printf(out, "bi_size=%llu", inode->bi_size);
+       prt_newline(out);
+
+       prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+       prt_newline(out);
+
+       prt_printf(out, "bi_version=%llu", inode->bi_version);
+       prt_newline(out);
 
 #define x(_name, _bits)                                                \
-       prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+       prt_printf(out, #_name "=%llu", (u64) inode->_name);    \
+       prt_newline(out);
        BCH_INODE_FIELDS_v3()
 #undef  x
+       printbuf_indent_sub(out, 2);
 }
 
 void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -587,7 +598,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
                }
        }
 
-       if (!(flags & BTREE_TRIGGER_TRANSACTIONAL) && (flags & BTREE_TRIGGER_INSERT)) {
+       if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
                BUG_ON(!trans->journal_res.seq);
 
                bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -597,7 +608,7 @@ int bch2_trigger_inode(struct btree_trans *trans,
                struct bch_fs *c = trans->c;
 
                percpu_down_read(&c->mark_lock);
-               this_cpu_add(c->usage_gc->nr_inodes, nr);
+               this_cpu_add(c->usage_gc->b.nr_inodes, nr);
                percpu_up_read(&c->mark_lock);
        }
 
diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h
new file mode 100644 (file)
index 0000000..83d1073
--- /dev/null
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX     4096
+#define BCACHEFS_ROOT_INO      4096
+
+struct bch_inode {
+       struct bch_val          v;
+
+       __le64                  bi_hash_seed;
+       __le32                  bi_flags;
+       __le16                  bi_mode;
+       __u8                    fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+       struct bch_val          v;
+
+       __le64                  bi_journal_seq;
+       __le64                  bi_hash_seed;
+       __le64                  bi_flags;
+       __le16                  bi_mode;
+       __u8                    fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+       struct bch_val          v;
+
+       __le64                  bi_journal_seq;
+       __le64                  bi_hash_seed;
+       __le64                  bi_flags;
+       __le64                  bi_sectors;
+       __le64                  bi_size;
+       __le64                  bi_version;
+       __u8                    fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL   6
+#define INODEv3_FIELDS_START_CUR       (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+       struct bch_val          v;
+
+       __le32                  bi_generation;
+       __le32                  pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2()                  \
+       x(bi_atime,                     96)     \
+       x(bi_ctime,                     96)     \
+       x(bi_mtime,                     96)     \
+       x(bi_otime,                     96)     \
+       x(bi_size,                      64)     \
+       x(bi_sectors,                   64)     \
+       x(bi_uid,                       32)     \
+       x(bi_gid,                       32)     \
+       x(bi_nlink,                     32)     \
+       x(bi_generation,                32)     \
+       x(bi_dev,                       32)     \
+       x(bi_data_checksum,             8)      \
+       x(bi_compression,               8)      \
+       x(bi_project,                   32)     \
+       x(bi_background_compression,    8)      \
+       x(bi_data_replicas,             8)      \
+       x(bi_promote_target,            16)     \
+       x(bi_foreground_target,         16)     \
+       x(bi_background_target,         16)     \
+       x(bi_erasure_code,              16)     \
+       x(bi_fields_set,                16)     \
+       x(bi_dir,                       64)     \
+       x(bi_dir_offset,                64)     \
+       x(bi_subvol,                    32)     \
+       x(bi_parent_subvol,             32)
+
+#define BCH_INODE_FIELDS_v3()                  \
+       x(bi_atime,                     96)     \
+       x(bi_ctime,                     96)     \
+       x(bi_mtime,                     96)     \
+       x(bi_otime,                     96)     \
+       x(bi_uid,                       32)     \
+       x(bi_gid,                       32)     \
+       x(bi_nlink,                     32)     \
+       x(bi_generation,                32)     \
+       x(bi_dev,                       32)     \
+       x(bi_data_checksum,             8)      \
+       x(bi_compression,               8)      \
+       x(bi_project,                   32)     \
+       x(bi_background_compression,    8)      \
+       x(bi_data_replicas,             8)      \
+       x(bi_promote_target,            16)     \
+       x(bi_foreground_target,         16)     \
+       x(bi_background_target,         16)     \
+       x(bi_erasure_code,              16)     \
+       x(bi_fields_set,                16)     \
+       x(bi_dir,                       64)     \
+       x(bi_dir_offset,                64)     \
+       x(bi_subvol,                    32)     \
+       x(bi_parent_subvol,             32)     \
+       x(bi_nocow,                     8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS()                       \
+       x(data_checksum,                8)      \
+       x(compression,                  8)      \
+       x(project,                      32)     \
+       x(background_compression,       8)      \
+       x(data_replicas,                8)      \
+       x(promote_target,               16)     \
+       x(foreground_target,            16)     \
+       x(background_target,            16)     \
+       x(erasure_code,                 16)     \
+       x(nocow,                        8)
+
+enum inode_opt_id {
+#define x(name, ...)                           \
+       Inode_opt_##name,
+       BCH_INODE_OPTS()
+#undef  x
+       Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS()                      \
+       x(sync,                         0)      \
+       x(immutable,                    1)      \
+       x(append,                       2)      \
+       x(nodump,                       3)      \
+       x(noatime,                      4)      \
+       x(i_size_dirty,                 5)      \
+       x(i_sectors_dirty,              6)      \
+       x(unlinked,                     7)      \
+       x(backptr_untrusted,            8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n)        BCH_INODE_##t = 1U << n,
+       BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n)        __BCH_INODE_##t = n,
+       BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS,        struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+                               struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE,     struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
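
The field lists above are x-macros: BCH_INODE_FIELDS_v3() expands x(...) once per row, and each consumer chooses what a row becomes by redefining x() first, as the enum inode_opt_id above does. A minimal sketch of two other plausible consumers (hypothetical names, not actual bcachefs code):

        /* Expand each row into a struct member: */
        struct example_inode_unpacked {
        #define x(_name, _bits) u64 _name;
                BCH_INODE_FIELDS_v3()
        #undef x
        };

        /* Or into a name table for printing: */
        static const char * const example_field_names[] = {
        #define x(_name, _bits) #_name,
                BCH_INODE_FIELDS_v3()
        #undef x
                NULL
        };
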
index ca6d5f516aa2be80824e7479e73d1cbfc2607117..1baf78594ccaf85d7d89fea4fc938a7f700d6dc0 100644 (file)
@@ -442,9 +442,7 @@ case LOGGED_OP_FINSERT_shift_extents:
 
                op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
 
-               ret =   bch2_bkey_set_needs_rebalance(c, copy,
-                                       opts.background_target,
-                                       opts.background_compression) ?:
+               ret =   bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
                        bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
                        bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
                        bch2_logged_op_update(trans, &op->k_i) ?:
index 33c0e783d54697b50c490309726b49eacb410189..ef3a53f9045af2591ab1f9e272dd9d6151250444 100644 (file)
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
                                     bkey_start_pos(&sk.k->k),
                                     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-               ret =   bch2_bkey_set_needs_rebalance(c, sk.k,
-                                       op->opts.background_target,
-                                       op->opts.background_compression) ?:
+               ret =   bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
                        bch2_extent_update(trans, inum, &iter, sk.k,
                                        &op->res,
                                        op->new_i_size, &op->i_sectors_delta,
@@ -1447,10 +1445,11 @@ err:
                        op->flags |= BCH_WRITE_DONE;
 
                        if (ret < 0) {
-                               bch_err_inum_offset_ratelimited(c,
-                                       op->pos.inode,
-                                       op->pos.offset << 9,
-                                       "%s(): error: %s", __func__, bch2_err_str(ret));
+                               if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+                                       bch_err_inum_offset_ratelimited(c,
+                                               op->pos.inode,
+                                               op->pos.offset << 9,
+                                               "%s(): error: %s", __func__, bch2_err_str(ret));
                                op->error = ret;
                                break;
                        }
index 8538ef34f62bc54e8bc570acbe793e4771745247..bc890776eb57933a5931edd2a2f07570f52b7ab3 100644 (file)
@@ -27,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
        NULL
 };
 
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+       union journal_res_state s = READ_ONCE(j->reservations);
+       unsigned i = seq & JOURNAL_BUF_MASK;
+       struct journal_buf *buf = j->buf + i;
+
+       prt_printf(out, "seq:");
+       prt_tab(out);
+       prt_printf(out, "%llu", seq);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_printf(out, "refcount:");
+       prt_tab(out);
+       prt_printf(out, "%u", journal_state_count(s, i));
+       prt_newline(out);
+
+       prt_printf(out, "size:");
+       prt_tab(out);
+       prt_human_readable_u64(out, vstruct_bytes(buf->data));
+       prt_newline(out);
+
+       prt_printf(out, "expires:");
+       prt_tab(out);
+       prt_printf(out, "%li jiffies", buf->expires - jiffies);
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+       if (!out->nr_tabstops)
+               printbuf_tabstop_push(out, 24);
+
+       for (u64 seq = journal_last_unwritten_seq(j);
+            seq <= journal_cur_seq(j);
+            seq++)
+               bch2_journal_buf_to_text(out, j, seq);
+}
+
 static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
 {
        return seq > j->seq_ondisk;
@@ -156,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
  * We don't close a journal_buf until the next journal_buf is finished writing,
  * and can be opened again - this also initializes the next journal_buf:
  */
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf = journal_cur_buf(j);
@@ -185,7 +226,17 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
        /* Close out old buffer: */
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
-       trace_journal_entry_close(c, vstruct_bytes(buf->data));
+       if (trace_journal_entry_close_enabled() && trace) {
+               struct printbuf pbuf = PRINTBUF;
+               pbuf.atomic++;
+
+               prt_str(&pbuf, "entry size: ");
+               prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+               prt_newline(&pbuf);
+               bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+               trace_journal_entry_close(c, pbuf.buf);
+               printbuf_exit(&pbuf);
+       }
 
        sectors = vstruct_blocks_plus(buf->data, c->block_bits,
                                      buf->u64s_reserved) << c->block_bits;
@@ -225,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
 void bch2_journal_halt(struct journal *j)
 {
        spin_lock(&j->lock);
-       __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+       __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
        if (!j->err_seq)
                j->err_seq = journal_cur_seq(j);
        journal_wake(j);
@@ -239,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
 
        /* Don't close it yet if we already have a write in flight: */
        if (ret)
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
        else if (nr_unwritten_journal_entries(j)) {
                struct journal_buf *buf = journal_cur_buf(j);
 
@@ -406,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
        if (delta > 0)
                mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
        else
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
 unlock:
        spin_unlock(&j->lock);
 }
@@ -463,13 +514,21 @@ retry:
            buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
                j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
-       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
        ret = journal_entry_open(j);
 
        if (ret == JOURNAL_ERR_max_in_flight) {
                track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
                                   &j->max_in_flight_start, true);
-               trace_and_count(c, journal_entry_full, c);
+               if (trace_journal_entry_full_enabled()) {
+                       struct printbuf buf = PRINTBUF;
+                       buf.atomic++;
+
+                       bch2_journal_bufs_to_text(&buf, j);
+                       trace_journal_entry_full(c, buf.buf);
+                       printbuf_exit(&buf);
+               }
+               count_event(c, journal_entry_full);
        }
 unlock:
        can_discard = j->can_discard;
@@ -549,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
                /*
                 * Not enough room in current journal entry, have to flush it:
                 */
-               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
        } else {
                journal_cur_buf(j)->u64s_reserved += d;
        }
@@ -606,7 +665,7 @@ recheck_need_open:
                struct journal_res res = { 0 };
 
                if (journal_entry_is_open(j))
-                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
 
                spin_unlock(&j->lock);
 
@@ -786,7 +845,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
 
                if (buf->need_flush_to_write_buffer) {
                        if (seq == journal_cur_seq(j))
-                               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+                               __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
 
                        union journal_res_state s;
                        s.v = atomic64_read_acquire(&j->reservations.counter);
@@ -1339,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        }
 
        prt_newline(out);
-
-       for (u64 seq = journal_cur_seq(j);
-            seq >= journal_last_unwritten_seq(j);
-            --seq) {
-               unsigned i = seq & JOURNAL_BUF_MASK;
-
-               prt_printf(out, "unwritten entry:");
-               prt_tab(out);
-               prt_printf(out, "%llu", seq);
-               prt_newline(out);
-               printbuf_indent_add(out, 2);
-
-               prt_printf(out, "refcount:");
-               prt_tab(out);
-               prt_printf(out, "%u", journal_state_count(s, i));
-               prt_newline(out);
-
-               prt_printf(out, "sectors:");
-               prt_tab(out);
-               prt_printf(out, "%u", j->buf[i].sectors);
-               prt_newline(out);
-
-               prt_printf(out, "expires");
-               prt_tab(out);
-               prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
-               prt_newline(out);
-
-               printbuf_indent_sub(out, 2);
-       }
+       prt_printf(out, "unwritten entries:");
+       prt_newline(out);
+       bch2_journal_bufs_to_text(out, j);
 
        prt_printf(out,
               "replay done:\t\t%i\n",
index b0f4dd491e1205d28c6af528fb59696cdbc4dc9c..bfd6585e746da45880da9b5ad8fb502586cbf933 100644 (file)
@@ -683,10 +683,7 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
        prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
 
        for (i = 0; i < nr_types; i++) {
-               if (i < BCH_DATA_NR)
-                       prt_printf(out, " %s", bch2_data_types[i]);
-               else
-                       prt_printf(out, " (unknown data type %u)", i);
+               bch2_prt_data_type(out, i);
                prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
                       le64_to_cpu(u->d[i].buckets),
                       le64_to_cpu(u->d[i].sectors),
@@ -1991,7 +1988,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
                        percpu_ref_get(&ca->io_ref);
 
                        bio = ca->journal.bio;
-                       bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+                       bio_reset(bio, ca->disk_sb.bdev,
+                                 REQ_OP_WRITE|REQ_PREFLUSH);
                        bio->bi_end_io          = journal_write_endio;
                        bio->bi_private         = ca;
                        closure_bio_submit(bio, cl);
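
The bio_reset() change above tracks how the block layer expresses an empty cache flush: not a standalone flush op, but a zero-length write carrying REQ_PREFLUSH. A sketch of the pattern, with a hypothetical completion handler:

        bio_reset(bio, bdev, REQ_OP_WRITE|REQ_PREFLUSH);
        bio->bi_end_io  = my_flush_endio;       /* hypothetical */
        bio->bi_private = priv;
        submit_bio(bio);

No data pages are attached; the device flushes its write cache before completing the bio.
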
diff --git a/libbcachefs/logged_ops_format.h b/libbcachefs/logged_ops_format.h
new file mode 100644 (file)
index 0000000..6a4bf71
--- /dev/null
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+       struct bch_val          v;
+       __le32                  subvol;
+       __le32                  pad;
+       __le64                  inum;
+       __le64                  new_i_size;
+};
+
+enum logged_op_finsert_state {
+       LOGGED_OP_FINSERT_start,
+       LOGGED_OP_FINSERT_shift_extents,
+       LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+       struct bch_val          v;
+       __u8                    state;
+       __u8                    pad[3];
+       __le32                  subvol;
+       __le64                  inum;
+       __le64                  dst_offset;
+       __le64                  src_offset;
+       __le64                  pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
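
The state byte above is what makes a logged finsert restartable after a crash: recovery replays the operation from the last state it committed. A hedged sketch of the resume shape (the real handler lives in io_misc.c; bodies are elided here):

        switch (op->v.state) {
        case LOGGED_OP_FINSERT_start:
                /* set up, then persist the state change: */
                op->v.state = LOGGED_OP_FINSERT_shift_extents;
                fallthrough;
        case LOGGED_OP_FINSERT_shift_extents:
                /* shift one extent at a time, updating op->v.pos */
                break;
        case LOGGED_OP_FINSERT_finish:
                break;
        }
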
index 7a33319dcd168001594f6532bafe0caf92f83c22..bf68ea49447b95055a4f6a1e6e7c6a7e373aebc5 100644 (file)
@@ -6,9 +6,11 @@
 #include "backpointers.h"
 #include "bkey_buf.h"
 #include "btree_gc.h"
+#include "btree_io.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "compress.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "errcode.h"
@@ -34,12 +36,46 @@ const char * const bch2_data_ops_strs[] = {
        NULL
 };
 
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+                                         struct bch_io_opts *io_opts,
+                                         struct data_update_opts *data_opts)
+{
+       printbuf_tabstop_push(out, 20);
+       prt_str(out, "rewrite ptrs:");
+       prt_tab(out);
+       bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+       prt_newline(out);
+
+       prt_str(out, "kill ptrs: ");
+       prt_tab(out);
+       bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+       prt_newline(out);
+
+       prt_str(out, "target: ");
+       prt_tab(out);
+       bch2_target_to_text(out, c, data_opts->target);
+       prt_newline(out);
+
+       prt_str(out, "compression: ");
+       prt_tab(out);
+       bch2_compression_opt_to_text(out, background_compression(*io_opts));
+       prt_newline(out);
+
+       prt_str(out, "extra replicas: ");
+       prt_tab(out);
+       prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+                              struct bch_io_opts *io_opts,
+                              struct data_update_opts *data_opts)
 {
        if (trace_move_extent_enabled()) {
                struct printbuf buf = PRINTBUF;
 
                bch2_bkey_val_to_text(&buf, c, k);
+               prt_newline(&buf);
+               bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
                trace_move_extent(c, buf.buf);
                printbuf_exit(&buf);
        }
@@ -111,6 +147,15 @@ static void move_write(struct moving_io *io)
                return;
        }
 
+       if (trace_move_extent_write_enabled()) {
+               struct bch_fs *c = io->write.op.c;
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+               trace_move_extent_write(c, buf.buf);
+               printbuf_exit(&buf);
+       }
+
        closure_get(&io->write.ctxt->cl);
        atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
        atomic_inc(&io->write.ctxt->write_ios);
@@ -241,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
+       trace_move_extent2(c, k, &io_opts, &data_opts);
+
        if (ctxt->stats)
                ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
-       trace_move_extent2(c, k);
 
        bch2_data_update_opts_normalize(k, &data_opts);
 
@@ -759,6 +805,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
                        if (!b)
                                goto next;
 
+                       unsigned sectors = btree_ptr_sectors_written(&b->key);
+
                        ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
                        bch2_trans_iter_exit(trans, &iter);
 
@@ -768,11 +816,10 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
                                goto err;
 
                        if (ctxt->rate)
-                               bch2_ratelimit_increment(ctxt->rate,
-                                                        c->opts.btree_node_size >> 9);
+                               bch2_ratelimit_increment(ctxt->rate, sectors);
                        if (ctxt->stats) {
-                               atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
-                               atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+                               atomic64_add(sectors, &ctxt->stats->sectors_seen);
+                               atomic64_add(sectors, &ctxt->stats->sectors_moved);
                        }
                }
 next:
@@ -1083,9 +1130,9 @@ int bch2_data_job(struct bch_fs *c,
 
 void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
 {
-       prt_printf(out, "%s: data type=%s pos=",
-                  stats->name,
-                  bch2_data_types[stats->data_type]);
+       prt_printf(out, "%s: data type=", stats->name);
+       bch2_prt_data_type(out, stats->data_type);
+       prt_str(out, " pos=");
        bch2_bbpos_to_text(out, stats->pos);
        prt_newline(out);
        printbuf_indent_add(out, 2);
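
bch2_data_update_opts_to_text() above, like the journal helpers earlier in this commit, uses the printbuf column idiom: push a tabstop once, then each "label:" + prt_tab() pair pads the value out to that column. A minimal sketch:

        static void example_to_text(struct printbuf *out)
        {
                if (!out->nr_tabstops)
                        printbuf_tabstop_push(out, 20);

                prt_str(out, "label:");
                prt_tab(out);                   /* pad to column 20 */
                prt_str(out, "value");
                prt_newline(out);
        }
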
index 8e6f230eac38155bf5d048367d6ebde35a4a15bd..b1ed0b9a20d35d61491ce0cff28b4bb2c7be42c3 100644 (file)
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
        NULL
 };
 
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
        BCH_COMPRESSION_TYPES()
        NULL
 };
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
        NULL
 };
 
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
        BCH_DATA_TYPES()
        NULL
 };
index 93a24fef42148488cdddb391cd291dd0e0168063..9a4b7faa376503993f1c2da8f8d1e5963ef6ca5a 100644 (file)
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
 extern const char * const __bch2_btree_ids[];
 extern const char * const bch2_csum_types[];
 extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
 extern const char * const bch2_compression_opts[];
 extern const char * const bch2_str_hash_types[];
 extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
 extern const char * const bch2_member_states[];
 extern const char * const bch2_jset_entry_types[];
 extern const char * const bch2_fs_usage_types[];
@@ -564,6 +564,11 @@ struct bch_io_opts {
 #undef x
 };
 
+static inline unsigned background_compression(struct bch_io_opts opts)
+{
+       return opts.background_compression ?: opts.compression;
+}
+
 struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
 bool bch2_opt_is_inode_opt(enum bch_opt_id);
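
background_compression() above relies on the GNU "a ?: b" extension: the background option wins when set, otherwise the foreground compression option applies. Spelled out without the extension:

        static inline unsigned background_compression_example(struct bch_io_opts opts)
        {
                return opts.background_compression
                        ? opts.background_compression
                        : opts.compression;
        }

The rebalance.c hunks below switch their open-coded fallbacks over to this helper.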
 
diff --git a/libbcachefs/quota_format.h b/libbcachefs/quota_format.h
new file mode 100644 (file)
index 0000000..dc34347
--- /dev/null
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+       QTYP_USR                = 0,
+       QTYP_GRP                = 1,
+       QTYP_PRJ                = 2,
+       QTYP_NR                 = 3,
+};
+
+enum quota_counters {
+       Q_SPC                   = 0,
+       Q_INO                   = 1,
+       Q_COUNTERS              = 2,
+};
+
+struct bch_quota_counter {
+       __le64                  hardlimit;
+       __le64                  softlimit;
+};
+
+struct bch_quota {
+       struct bch_val          v;
+       struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+       __le32                          timelimit;
+       __le32                          warnlimit;
+};
+
+struct bch_sb_quota_type {
+       __le64                          flags;
+       struct bch_sb_quota_counter     c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+       struct bch_sb_field             field;
+       struct bch_sb_quota_type        q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
index 95f46cb3b5bdfd820e845a8cceda2b3c2fb67cf4..22d1017aa49b975756905a9a69ce8bcd82416ca3 100644 (file)
@@ -177,8 +177,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
                prt_str(&buf, "target=");
                bch2_target_to_text(&buf, c, r->target);
                prt_str(&buf, " compression=");
-               struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
-               prt_str(&buf, bch2_compression_opts[opt.type]);
+               bch2_compression_opt_to_text(&buf, r->compression);
                prt_str(&buf, " ");
                bch2_bkey_val_to_text(&buf, c, k);
 
@@ -254,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
 
        if (k.k->p.inode) {
                target          = io_opts->background_target;
-               compression     = io_opts->background_compression ?: io_opts->compression;
+               compression     = background_compression(*io_opts);
        } else {
                const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
 
                target          = r ? r->target : io_opts->background_target;
-               compression     = r ? r->compression :
-                       (io_opts->background_compression ?: io_opts->compression);
+               compression     = r ? r->compression : background_compression(*io_opts);
        }
 
        data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -371,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
            !kthread_should_stop() &&
            !atomic64_read(&r->work_stats.sectors_seen) &&
            !atomic64_read(&r->scan_stats.sectors_seen)) {
+               bch2_moving_ctxt_flush_all(ctxt);
                bch2_trans_unlock_long(trans);
                rebalance_wait(c);
        }
@@ -385,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
        struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
        struct moving_context ctxt;
-       int ret;
 
        set_freezable();
 
@@ -393,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
                              writepoint_ptr(&c->rebalance_write_point),
                              true);
 
-       while (!kthread_should_stop() &&
-              !(ret = do_rebalance(&ctxt)))
+       while (!kthread_should_stop() && !do_rebalance(&ctxt))
                ;
 
        bch2_moving_ctxt_exit(&ctxt);
index 725214605a050996196c28a9132f8fe247e76d28..9127d0e3ca2f6a3fd44e076b42f01ee6f7736427 100644 (file)
@@ -280,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                        le64_to_cpu(u->v);
                        break;
                case BCH_FS_USAGE_inodes:
-                       c->usage_base->nr_inodes = le64_to_cpu(u->v);
+                       c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
                        break;
                case BCH_FS_USAGE_key_version:
                        atomic64_set(&c->key_version,
index b24b71bc4e60956917a7a90008357899e4325469..c47c66c2b394dc8df391fa3adf8bfea03e1e447e 100644 (file)
@@ -292,23 +292,15 @@ static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *f
        }
 }
 
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
-                             enum btree_id btree_id, unsigned level,
-                             struct bkey_s_c old, struct bkey_s new,
-                             unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c old, struct bkey_s new,
+                          unsigned flags)
 {
        if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
            (flags & BTREE_TRIGGER_INSERT))
                check_indirect_extent_deleting(new, &flags);
 
-       if (old.k->type == KEY_TYPE_reflink_v &&
-           new.k->type == KEY_TYPE_reflink_v &&
-           old.k->u64s == new.k->u64s &&
-           !memcmp(bkey_s_c_to_reflink_v(old).v->start,
-                   bkey_s_to_reflink_v(new).v->start,
-                   bkey_val_bytes(new.k) - 8))
-               return 0;
-
        return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
 }
 
@@ -332,7 +324,7 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
               min(datalen, 32U), d.v->data);
 }
 
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
                              enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old, struct bkey_s new,
                              unsigned flags)
@@ -494,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
 
                bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
 
+               if (dst_inum.inum < src_inum.inum) {
+                       /* Avoid some lock cycle transaction restarts */
+                       ret = bch2_btree_iter_traverse(&dst_iter);
+                       if (ret)
+                               continue;
+               }
+
                dst_done = dst_iter.pos.offset - dst_start.offset;
                src_want = POS(src_start.inode, src_start.offset + dst_done);
                bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -546,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                                min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter.pos.offset));
 
-               ret =   bch2_bkey_set_needs_rebalance(c, new_dst.k,
-                                       opts.background_target,
-                                       opts.background_compression) ?:
+               ret =   bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
                        bch2_extent_update(trans, dst_inum, &dst_iter,
                                        new_dst.k, &disk_res,
                                        new_i_size, i_sectors_delta,
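
The dst_inum.inum < src_inum.inum check added above is a lock-ordering tweak: when one transaction holds iterators on two inodes, traversing them in increasing key order means every caller takes btree node locks in the same order, avoiding ABBA lock cycles that the cycle detector would otherwise resolve by restarting the transaction. The standalone form of the same principle, as a sketch:

        static void lock_two_ordered(struct mutex *a, struct mutex *b)
        {
                if (a == b) {
                        mutex_lock(a);
                        return;
                }
                if (a > b)      /* any global order works; here, by address */
                        swap(a, b);
                mutex_lock(a);
                mutex_lock(b);
        }
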
index 8ee778ec0022a327145eb91ebefbcb38cc1240bf..4d8867289717bf6cf46f05b0c58e3adcc42efae7 100644 (file)
@@ -24,14 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
                           enum bkey_invalid_flags, struct printbuf *);
 void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
                              struct bkey_s_c, struct bkey_s, unsigned);
 
 #define bch2_bkey_ops_reflink_v ((struct bkey_ops) {           \
        .key_invalid    = bch2_reflink_v_invalid,               \
        .val_to_text    = bch2_reflink_v_to_text,               \
        .swab           = bch2_ptr_swab,                        \
-       .trigger        = bch2_trans_mark_reflink_v,            \
+       .trigger        = bch2_trigger_reflink_v,               \
        .min_val_size   = 8,                                    \
 })
 
@@ -39,7 +39,7 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
                                      enum bkey_invalid_flags, struct printbuf *);
 void bch2_indirect_inline_data_to_text(struct printbuf *,
                                struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
                                         enum btree_id, unsigned,
                              struct bkey_s_c, struct bkey_s,
                              unsigned);
@@ -47,7 +47,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
 #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) {        \
        .key_invalid    = bch2_indirect_inline_data_invalid,    \
        .val_to_text    = bch2_indirect_inline_data_to_text,    \
-       .trigger        = bch2_trans_mark_indirect_inline_data, \
+       .trigger        = bch2_trigger_indirect_inline_data,    \
        .min_val_size   = 8,                                    \
 })
 
diff --git a/libbcachefs/reflink_format.h b/libbcachefs/reflink_format.h
new file mode 100644 (file)
index 0000000..6772eeb
--- /dev/null
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+       struct bch_val          v;
+       __le64                  idx;
+       /*
+        * A reflink pointer might point to an indirect extent which is then
+        * later split (by copygc or rebalance). If we only pointed to part of
+        * the original indirect extent, and then one of the fragments is
+        * outside the range we point to, we'd leak a refcount: so when creating
+        * reflink pointers, we need to store pad values to remember the full
+        * range we were taking a reference on.
+        */
+       __le32                  front_pad;
+       __le32                  back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+       struct bch_val          v;
+       __le64                  refcount;
+       union bch_extent_entry  start[0];
+       __u64                   _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+       struct bch_val          v;
+       __le64                  refcount;
+       u8                      data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
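
To make the pads above concrete: the range a reflink pointer holds a reference on is the pointed-to range widened by front_pad/back_pad, so splitting the indirect extent later can't strand a refcount. A hypothetical helper (the extent's size lives in the bkey, which isn't part of the value shown here):

        /* Start of the referenced range in the reflink btree;
         * the end would be idx + key size + back_pad. */
        static inline u64 example_reflink_p_start(const struct bch_reflink_p *p)
        {
                return le64_to_cpu(p->idx) - le32_to_cpu(p->front_pad);
        }
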
index 92ba56ef1fc89690656e9625871ecd7ee38b5f9b..cc2672c120312c39f82e9a1a9afe0ed959b15dba 100644 (file)
@@ -9,6 +9,12 @@
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
 
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, size_t size)
+{
+       return memcmp(l, r, size);
+}
+
 /* Replicas tracking - in memory: */
 
 static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
@@ -33,21 +39,16 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
-       eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+       eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
 }
 
 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
                                           struct bch_replicas_entry_v0 *e)
 {
-       unsigned i;
-
-       if (e->data_type < BCH_DATA_NR)
-               prt_printf(out, "%s", bch2_data_types[e->data_type]);
-       else
-               prt_printf(out, "(invalid data type %u)", e->data_type);
+       bch2_prt_data_type(out, e->data_type);
 
        prt_printf(out, ": %u [", e->nr_devs);
-       for (i = 0; i < e->nr_devs; i++)
+       for (unsigned i = 0; i < e->nr_devs; i++)
                prt_printf(out, i ? " %u" : "%u", e->devs[i]);
        prt_printf(out, "]");
 }
@@ -55,15 +56,10 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
 void bch2_replicas_entry_to_text(struct printbuf *out,
                                 struct bch_replicas_entry_v1 *e)
 {
-       unsigned i;
-
-       if (e->data_type < BCH_DATA_NR)
-               prt_printf(out, "%s", bch2_data_types[e->data_type]);
-       else
-               prt_printf(out, "(invalid data type %u)", e->data_type);
+       bch2_prt_data_type(out, e->data_type);
 
        prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
-       for (i = 0; i < e->nr_devs; i++)
+       for (unsigned i = 0; i < e->nr_devs; i++)
                prt_printf(out, i ? " %u" : "%u", e->devs[i]);
        prt_printf(out, "]");
 }
@@ -831,7 +827,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
-                     memcmp, NULL);
+                     bch2_memcmp, NULL);
 
        for (i = 0; i < cpu_r->nr; i++) {
                struct bch_replicas_entry_v1 *e =
index 9632f36f5f318134065cfdbae613b422cce98f6a..b6bf0ebe7e84046a5d08ade7d34bae9ae0bff3a5 100644 (file)
@@ -207,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = BCH_FS_USAGE_inodes;
-               u->v            = cpu_to_le64(c->usage_base->nr_inodes);
+               u->v            = cpu_to_le64(c->usage_base->b.nr_inodes);
        }
 
        {
similarity index 99%
rename from libbcachefs/counters.c
rename to libbcachefs/sb-counters.c
index 02a996e06a64e3d10483f7fcbffc0de66428f9ed..7dc898761bb3125a79c82a5de17de0807920d98d 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
 
 /* BCH_SB_FIELD_counters */
 
similarity index 77%
rename from libbcachefs/counters.h
rename to libbcachefs/sb-counters.h
index 4778aa19bf346459c5ca252e6b75279503867f43..81f8aec9fcb1cedf43143f269fdfc1b6fb39e441 100644 (file)
@@ -1,11 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
 
 #include "bcachefs.h"
 #include "super-io.h"
 
-
 int bch2_sb_counters_to_cpu(struct bch_fs *);
 int bch2_sb_counters_from_cpu(struct bch_fs *);
 
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
 
 extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
 
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/libbcachefs/sb-counters_format.h b/libbcachefs/sb-counters_format.h
new file mode 100644 (file)
index 0000000..62ea478
--- /dev/null
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS()                              \
+       x(io_read,                                      0)      \
+       x(io_write,                                     1)      \
+       x(io_move,                                      2)      \
+       x(bucket_invalidate,                            3)      \
+       x(bucket_discard,                               4)      \
+       x(bucket_alloc,                                 5)      \
+       x(bucket_alloc_fail,                            6)      \
+       x(btree_cache_scan,                             7)      \
+       x(btree_cache_reap,                             8)      \
+       x(btree_cache_cannibalize,                      9)      \
+       x(btree_cache_cannibalize_lock,                 10)     \
+       x(btree_cache_cannibalize_lock_fail,            11)     \
+       x(btree_cache_cannibalize_unlock,               12)     \
+       x(btree_node_write,                             13)     \
+       x(btree_node_read,                              14)     \
+       x(btree_node_compact,                           15)     \
+       x(btree_node_merge,                             16)     \
+       x(btree_node_split,                             17)     \
+       x(btree_node_rewrite,                           18)     \
+       x(btree_node_alloc,                             19)     \
+       x(btree_node_free,                              20)     \
+       x(btree_node_set_root,                          21)     \
+       x(btree_path_relock_fail,                       22)     \
+       x(btree_path_upgrade_fail,                      23)     \
+       x(btree_reserve_get_fail,                       24)     \
+       x(journal_entry_full,                           25)     \
+       x(journal_full,                                 26)     \
+       x(journal_reclaim_finish,                       27)     \
+       x(journal_reclaim_start,                        28)     \
+       x(journal_write,                                29)     \
+       x(read_promote,                                 30)     \
+       x(read_bounce,                                  31)     \
+       x(read_split,                                   33)     \
+       x(read_retry,                                   32)     \
+       x(read_reuse_race,                              34)     \
+       x(move_extent_read,                             35)     \
+       x(move_extent_write,                            36)     \
+       x(move_extent_finish,                           37)     \
+       x(move_extent_fail,                             38)     \
+       x(move_extent_start_fail,                       39)     \
+       x(copygc,                                       40)     \
+       x(copygc_wait,                                  41)     \
+       x(gc_gens_end,                                  42)     \
+       x(gc_gens_start,                                43)     \
+       x(trans_blocked_journal_reclaim,                44)     \
+       x(trans_restart_btree_node_reused,              45)     \
+       x(trans_restart_btree_node_split,               46)     \
+       x(trans_restart_fault_inject,                   47)     \
+       x(trans_restart_iter_upgrade,                   48)     \
+       x(trans_restart_journal_preres_get,             49)     \
+       x(trans_restart_journal_reclaim,                50)     \
+       x(trans_restart_journal_res_get,                51)     \
+       x(trans_restart_key_cache_key_realloced,        52)     \
+       x(trans_restart_key_cache_raced,                53)     \
+       x(trans_restart_mark_replicas,                  54)     \
+       x(trans_restart_mem_realloced,                  55)     \
+       x(trans_restart_memory_allocation_failure,      56)     \
+       x(trans_restart_relock,                         57)     \
+       x(trans_restart_relock_after_fill,              58)     \
+       x(trans_restart_relock_key_cache_fill,          59)     \
+       x(trans_restart_relock_next_node,               60)     \
+       x(trans_restart_relock_parent_for_fill,         61)     \
+       x(trans_restart_relock_path,                    62)     \
+       x(trans_restart_relock_path_intent,             63)     \
+       x(trans_restart_too_many_iters,                 64)     \
+       x(trans_restart_traverse,                       65)     \
+       x(trans_restart_upgrade,                        66)     \
+       x(trans_restart_would_deadlock,                 67)     \
+       x(trans_restart_would_deadlock_write,           68)     \
+       x(trans_restart_injected,                       69)     \
+       x(trans_restart_key_cache_upgrade,              70)     \
+       x(trans_traverse_all,                           71)     \
+       x(transaction_commit,                           72)     \
+       x(write_super,                                  73)     \
+       x(trans_restart_would_deadlock_recursion_limit, 74)     \
+       x(trans_restart_write_buffer_flush,             75)     \
+       x(trans_restart_split_race,                     76)     \
+       x(write_buffer_flush_slowpath,                  77)     \
+       x(write_buffer_flush_sync,                      78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+       BCH_PERSISTENT_COUNTERS()
+#undef x
+       BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+       struct bch_sb_field     field;
+       __le64                  d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
index a44a238bf8b5550023226844734424b1211c812a..a45354d2acde9f3ad0b149247c8ff4c7c869fb15 100644 (file)
@@ -251,7 +251,7 @@ static void member_to_text(struct printbuf *out,
        prt_printf(out, "Data allowed:");
        prt_tab(out);
        if (BCH_MEMBER_DATA_ALLOWED(&m))
-               prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+               prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
        else
                prt_printf(out, "(none)");
        prt_newline(out);
@@ -259,7 +259,7 @@ static void member_to_text(struct printbuf *out,
        prt_printf(out, "Has data:");
        prt_tab(out);
        if (data_have)
-               prt_bitflags(out, bch2_data_types, data_have);
+               prt_bitflags(out, __bch2_data_types, data_have);
        else
                prt_printf(out, "(none)");
        prt_newline(out);
index 56af937523ff2a8deda0a5168f45a67533a57da5..45f67e8b29eb67f188e5cfb32aa39e0b1ad1d625 100644 (file)
@@ -1053,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
                n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
                n->v.tree       = cpu_to_le32(tree);
                n->v.depth      = cpu_to_le32(depth);
+               n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
+               n->v.btime.hi   = 0;
 
                for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
                        n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1681,5 +1683,5 @@ int bch2_snapshots_read(struct bch_fs *c)
 
 void bch2_fs_snapshots_exit(struct bch_fs *c)
 {
-       kfree(rcu_dereference_protected(c->snapshots, true));
+       kvfree(rcu_dereference_protected(c->snapshots, true));
 }
diff --git a/libbcachefs/snapshot_format.h b/libbcachefs/snapshot_format.h
new file mode 100644 (file)
index 0000000..aabcd3a
--- /dev/null
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  parent;
+       __le32                  children[2];
+       __le32                  subvol;
+       /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+       __le32                  tree;
+       __le32                  depth;
+       __le32                  skip[3];
+       bch_le128               btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED,     struct bch_snapshot, flags,  0,  1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us a persistent identifier for each tree of
+ * bch_snapshot nodes, and allows us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+       struct bch_val          v;
+       __le32                  master_subvol;
+       __le32                  root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
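
Each LE32_BITMASK() line above generates accessors for a bit range of a little-endian field. Roughly what the BCH_SNAPSHOT_DELETED expansion provides, as a sketch (names abbreviated, not the exact macro output):

        static inline __u64 EXAMPLE_SNAPSHOT_DELETED(const struct bch_snapshot *k)
        {
                return (le32_to_cpu(k->flags) >> 0) & 1;
        }

        static inline void SET_EXAMPLE_SNAPSHOT_DELETED(struct bch_snapshot *k, __u64 v)
        {
                __u32 f = le32_to_cpu(k->flags);

                f = (f & ~1U) | ((__u32) v & 1);
                k->flags = cpu_to_le32(f);
        }
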
diff --git a/libbcachefs/subvolume_format.h b/libbcachefs/subvolume_format.h
new file mode 100644 (file)
index 0000000..af79134
--- /dev/null
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN         POS(0, 1)
+#define SUBVOL_POS_MAX         POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL   1
+
+struct bch_subvolume {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  snapshot;
+       __le64                  inode;
+       /*
+        * Snapshot subvolumes form a tree, separate from the snapshot nodes
+        * tree - if this subvolume is a snapshot, this is the ID of the
+        * subvolume it was created from:
+        *
+        * This is _not_ necessarily the subvolume of the directory containing
+        * this subvolume:
+        */
+       __le32                  parent;
+       __le32                  pad;
+       bch_le128               otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO,         struct bch_subvolume, flags,  0,  1)
+/*
+ * We need to know whether a subvolume is a snapshot so we know whether it can
+ * be deleted directly (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,   struct bch_subvolume, flags,  2,  3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
index 55926b81eede63596f7a48f6daaa363bede4d504..d60c7d27a0477cb0de116675671d5c888d8f1c86 100644 (file)
@@ -2,7 +2,6 @@
 
 #include "bcachefs.h"
 #include "checksum.h"
-#include "counters.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
@@ -13,6 +12,7 @@
 #include "replicas.h"
 #include "quota.h"
 #include "sb-clean.h"
+#include "sb-counters.h"
 #include "sb-downgrade.h"
 #include "sb-errors.h"
 #include "sb-members.h"
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
 void bch2_free_super(struct bch_sb_handle *sb)
 {
        kfree(sb->bio);
-       if (!IS_ERR_OR_NULL(sb->bdev))
-               blkdev_put(sb->bdev, sb->holder);
+       if (!IS_ERR_OR_NULL(sb->bdev_handle))
+               bdev_release(sb->bdev_handle);
        kfree(sb->holder);
        kfree(sb->sb_name);
 
@@ -704,21 +704,22 @@ retry:
        if (!opt_get(*opts, nochanges))
                sb->mode |= BLK_OPEN_WRITE;
 
-       sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-       if (IS_ERR(sb->bdev) &&
-           PTR_ERR(sb->bdev) == -EACCES &&
+       sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+       if (IS_ERR(sb->bdev_handle) &&
+           PTR_ERR(sb->bdev_handle) == -EACCES &&
            opt_get(*opts, read_only)) {
                sb->mode &= ~BLK_OPEN_WRITE;
 
-               sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-               if (!IS_ERR(sb->bdev))
+               sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+               if (!IS_ERR(sb->bdev_handle))
                        opt_set(*opts, nochanges, true);
        }
 
-       if (IS_ERR(sb->bdev)) {
-               ret = PTR_ERR(sb->bdev);
+       if (IS_ERR(sb->bdev_handle)) {
+               ret = PTR_ERR(sb->bdev_handle);
                goto out;
        }
+       sb->bdev = sb->bdev_handle->bdev;
 
        ret = bch2_sb_realloc(sb, 0);
        if (ret) {
@@ -1320,7 +1321,9 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
 
        prt_printf(out, "Superblock size:");
        prt_tab(out);
-       prt_printf(out, "%zu", vstruct_bytes(sb));
+       prt_units_u64(out, vstruct_bytes(sb));
+       prt_str(out, "/");
+       prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
        prt_newline(out);
 
        prt_printf(out, "Clean:");
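
The open path above follows the Linux 6.7-era block API: bdev_open_by_path() returns a struct bdev_handle, and teardown is a single bdev_release() on that handle rather than blkdev_put() with a repeated holder argument. The resulting usage pattern, as a sketch:

        struct bdev_handle *h = bdev_open_by_path(path, BLK_OPEN_READ, holder,
                                                  &bch2_sb_handle_bdev_ops);
        if (IS_ERR(h))
                return PTR_ERR(h);

        /* ... I/O goes through h->bdev ... */

        bdev_release(h);
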
index 9dbc35940197f1c55c1bc48746bc23a3983ac203..da8697c79a97e7d2c2f35056b707103d65618c5b 100644 (file)
@@ -23,7 +23,6 @@
 #include "checksum.h"
 #include "clock.h"
 #include "compress.h"
-#include "counters.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
@@ -49,6 +48,7 @@
 #include "recovery.h"
 #include "replicas.h"
 #include "sb-clean.h"
+#include "sb-counters.h"
 #include "sb-errors.h"
 #include "sb-members.h"
 #include "snapshot.h"
@@ -862,13 +862,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
        if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-                               WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
            !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
-                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
-                               WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+                               WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
            !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
                                WQ_FREEZABLE, 0)) ||
 #ifndef BCH_WRITE_REF_DEBUG
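All four I/O-path workqueues now carry WQ_HIGHPRI (io_complete_wq already had
it), and the flag order is made uniform. WQ_HIGHPRI work is served by a
separate high-priority worker pool, so btree update, btree I/O and copygc
completions are not starved by ordinary kworkers under load. A hedged sketch
of the allocation pattern:

    struct workqueue_struct *wq =
            alloc_workqueue("example", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1);
    if (!wq)                        /* last argument is max_active */
            return -ENOMEM;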
@@ -883,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
-                                       btree_bytes(c)) ||
+                                       c->opts.btree_node_size) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
            !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
                                              sizeof(u64), GFP_KERNEL))) {
@@ -1386,8 +1386,8 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
        prt_bdevname(&name, ca->disk_sb.bdev);
 
        if (c->sb.nr_devices == 1)
-               strlcpy(c->name, name.buf, sizeof(c->name));
-       strlcpy(ca->name, name.buf, sizeof(ca->name));
+               strscpy(c->name, name.buf, sizeof(c->name));
+       strscpy(ca->name, name.buf, sizeof(ca->name));
 
        printbuf_exit(&name);
 
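strlcpy() is deprecated in the kernel: it returns strlen(src), which forces a
full walk of the source string even when the copy truncates. strscpy() bounds
its reads, always NUL-terminates, and reports truncation directly:

    char buf[8];
    ssize_t n = strscpy(buf, "bcachefs0", sizeof(buf));
    /* n == -E2BIG: buf holds "bcachef\0"; strlcpy() would have
     * returned 9 and read all of src to compute that. */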
@@ -1625,7 +1625,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
        if (data) {
                struct printbuf data_has = PRINTBUF;
 
-               prt_bitflags(&data_has, bch2_data_types, data);
+               prt_bitflags(&data_has, __bch2_data_types, data);
                bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
                printbuf_exit(&data_has);
                ret = -EBUSY;
index 87d159b9b8cd663fcaa8d2218a2f64190bdea490..0e5a14fc8e7fbfde622ec68dfae45f69ad83bd87 100644 (file)
@@ -4,6 +4,7 @@
 
 struct bch_sb_handle {
        struct bch_sb           *sb;
+       struct bdev_handle      *bdev_handle;
        struct block_device     *bdev;
        char                    *sb_name;
        struct bio              *bio;
index 8ed52319ff68d2b93194970b7da51218a579b0dd..cee80c47feea2b27fa7d18fc55a39228db7f0b96 100644 (file)
@@ -21,6 +21,7 @@
 #include "btree_gc.h"
 #include "buckets.h"
 #include "clock.h"
+#include "compress.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "inode.h"
@@ -247,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 
        mutex_lock(&c->btree_cache.lock);
        list_for_each_entry(b, &c->btree_cache.live, list)
-               ret += btree_bytes(c);
+               ret += btree_buf_bytes(b);
 
        mutex_unlock(&c->btree_cache.lock);
        return ret;
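With variable-size btree nodes, a cached node's memory footprint is the size
of its own buffer, not the filesystem-wide maximum, hence the switch from
btree_bytes(c) to per-node btree_buf_bytes(b) here (and, in the earlier
super.c hunk, sizing the bounce pool from the btree_node_size option, the
upper bound). A sketch of the shape of the helper, assuming the node stores
its buffer size as a log2:

    static inline size_t btree_buf_bytes(const struct btree *b)
    {
            return 1UL << b->byte_order;
    }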
@@ -330,7 +331,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
        prt_newline(out);
 
        for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
-               prt_str(out, bch2_compression_types[i]);
+               bch2_prt_compression_type(out, i);
                prt_tab(out);
 
                prt_human_readable_u64(out, s[i].sectors_compressed << 9);
@@ -725,8 +726,10 @@ STORE(bch2_fs_opts_dir)
        bch2_opt_set_sb(c, opt, v);
        bch2_opt_set_by_id(&c->opts, id, v);
 
-       if ((id == Opt_background_target ||
-            id == Opt_background_compression) && v)
+       if (v &&
+           (id == Opt_background_target ||
+            id == Opt_background_compression ||
+            (id == Opt_compression && !c->opts.background_compression)))
                bch2_set_rebalance_needs_scan(c, 0);
 
        ret = size;
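The rebalance trigger is widened: setting the foreground compression option
now also queues a rebalance scan, but only when no background compression is
configured, since background_compression overrides the foreground type for
rebalance moves. A hypothetical restatement of the predicate:

    static bool opt_triggers_rebalance_scan(struct bch_fs *c, enum bch_opt_id id)
    {
            return id == Opt_background_target ||
                   id == Opt_background_compression ||
                   (id == Opt_compression && !c->opts.background_compression);
    }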
@@ -883,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
 
                for (i = 1; i < BCH_DATA_NR; i++)
                        prt_printf(out, "%-12s:%12llu\n",
-                              bch2_data_types[i],
+                              bch2_data_type_str(i),
                               percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
        }
 }
@@ -908,7 +911,7 @@ SHOW(bch2_dev)
        }
 
        if (attr == &sysfs_has_data) {
-               prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+               prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
                prt_char(out, '\n');
        }
 
index c94876b3bb06e4d8bf0ba490421ead37d87e5569..293b90d704fb5b48ed39038e793c4d3cbf77b5a8 100644 (file)
@@ -46,7 +46,7 @@ DECLARE_EVENT_CLASS(fs_str,
                __assign_str(str, str);
        ),
 
-       TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
+       TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
 );
 
 DECLARE_EVENT_CLASS(trans_str,
@@ -273,28 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(bch_fs, journal_entry_full,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
-TRACE_EVENT(journal_entry_close,
-       TP_PROTO(struct bch_fs *c, unsigned bytes),
-       TP_ARGS(c, bytes),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u32,            bytes                   )
-       ),
-
-       TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->bytes                  = bytes;
-       ),
-
-       TP_printk("%d,%d entry bytes %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->bytes)
+DEFINE_EVENT(fs_str, journal_entry_close,
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
 DEFINE_EVENT(bio, journal_write,
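The journal tracepoints are folded into the existing fs_str event class:
DECLARE_EVENT_CLASS() defines TP_STRUCT__entry/TP_fast_assign/TP_printk once,
and each DEFINE_EVENT() just names another tracepoint sharing them, which is
why the bespoke TRACE_EVENT(journal_entry_close) body above can be deleted.
The formatted detail moves to the call site; a sketch, assuming the callers
build the string with a printbuf:

    struct printbuf buf = PRINTBUF;
    prt_printf(&buf, "entry bytes %u", bytes);
    trace_journal_entry_close(c, buf.buf);
    printbuf_exit(&buf);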
@@ -542,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
                __entry->level                  = path->level;
                TRACE_BPOS_assign(pos, path->pos);
 
-               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
                __entry->self_read_count        = c.n[SIX_LOCK_read];
                __entry->self_intent_count      = c.n[SIX_LOCK_intent];
 
@@ -827,40 +813,28 @@ TRACE_EVENT(bucket_evacuate,
 );
 
 DEFINE_EVENT(fs_str, move_extent,
-       TP_PROTO(struct bch_fs *c, const char *k),
-       TP_ARGS(c, k)
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
 DEFINE_EVENT(fs_str, move_extent_read,
-       TP_PROTO(struct bch_fs *c, const char *k),
-       TP_ARGS(c, k)
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
 DEFINE_EVENT(fs_str, move_extent_write,
-       TP_PROTO(struct bch_fs *c, const char *k),
-       TP_ARGS(c, k)
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
 DEFINE_EVENT(fs_str, move_extent_finish,
-       TP_PROTO(struct bch_fs *c, const char *k),
-       TP_ARGS(c, k)
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
-TRACE_EVENT(move_extent_fail,
-       TP_PROTO(struct bch_fs *c, const char *msg),
-       TP_ARGS(c, msg),
-
-       TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __string(msg,           msg                     )
-       ),
-
-       TP_fast_assign(
-               __entry->dev            = c->dev;
-               __assign_str(msg, msg);
-       ),
-
-       TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+       TP_PROTO(struct bch_fs *c, const char *str),
+       TP_ARGS(c, str)
 );
 
 DEFINE_EVENT(fs_str, move_extent_start_fail,
@@ -1039,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
                __entry->level          = b->c.level;
                __entry->written        = b->written;
                __entry->blocks         = btree_blocks(trans->c);
-               __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+               __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
        ),
 
        TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -1146,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter,    trans_restart_btree_node_split,
        TP_ARGS(trans, caller_ip, path)
 );
 
-struct get_locks_fail;
-
 TRACE_EVENT(trans_restart_upgrade,
        TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
@@ -1195,11 +1167,9 @@ TRACE_EVENT(trans_restart_upgrade,
                  __entry->node_seq)
 );
 
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
-       TP_PROTO(struct btree_trans *trans,
-                unsigned long caller_ip,
-                struct btree_path *path),
-       TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str,        trans_restart_relock,
+       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+       TP_ARGS(trans, caller_ip, str)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
index c2ef7cddaa4fcb0e9de9df263aadd019cc7a4965..56b815fd9fc6ee5a541aa8e7007f3c00025c493d 100644 (file)
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
        return true;
 }
 
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
 {
        while (nr_bits)
                prt_char(out, '0' + ((v >> --nr_bits) & 1));
 }
 
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+       bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
 void bch2_print_string_as_lines(const char *prefix, const char *lines)
 {
        const char *p;
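bch2_prt_u64_base2() sizes its output with fls64(), which returns the 1-based
index of the most significant set bit (0 for v == 0); the "?: 1" fallback
guarantees at least one digit is printed. For example:

    bch2_prt_u64_base2(&buf, 10);   /* prints "1010" */
    bch2_prt_u64_base2(&buf, 0);    /* fls64(0) == 0, so prints "0" */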
@@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
        console_unlock();
 }
 
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+                       gfp_t gfp)
 {
 #ifdef CONFIG_STACKTRACE
        unsigned nr_entries = 0;
-       int ret = 0;
 
        stack->nr = 0;
-       ret = darray_make_room(stack, 32);
+       int ret = darray_make_room_gfp(stack, 32, gfp);
        if (ret)
                return ret;
 
@@ -303,10 +308,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
        }
 }
 
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
 {
        bch_stacktrace stack = { 0 };
-       int ret = bch2_save_backtrace(&stack, task, skipnr + 1);
+       int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
 
        bch2_prt_backtrace(out, &stack);
        darray_exit(&stack);
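Capturing a backtrace allocates (darray_make_room_gfp() above), and some
callers sit in contexts that must not recurse into reclaim, so the allocation
flags become a parameter rather than an implicit GFP_KERNEL. Usage sketch:

    bch2_prt_task_backtrace(&buf, task, 0, GFP_KERNEL);  /* may sleep */
    bch2_prt_task_backtrace(&buf, task, 0, GFP_NOWAIT);  /* atomic caller */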
@@ -1186,7 +1191,9 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
 {
        darray_init(ret);
 
-       char *dev_name = kstrdup(_dev_name, GFP_KERNEL), *s = dev_name;
+       char *dev_name, *s, *orig;
+
+       dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
        if (!dev_name)
                return -ENOMEM;
 
@@ -1201,10 +1208,10 @@ int bch2_split_devs(const char *_dev_name, darray_str *ret)
                }
        }
 
-       kfree(dev_name);
+       kfree(orig);
        return 0;
 err:
        bch2_darray_str_exit(ret);
-       kfree(dev_name);
+       kfree(orig);
        return -ENOMEM;
 }
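This fixes a bad kfree(): the parsing loop elided from this hunk advances
dev_name while splitting the string, so by the time of the free it need not
point at the start of the kstrdup() allocation any more, and freeing a moved
pointer is undefined behaviour. Keeping an untouched orig pointer is the
usual pattern, e.g. with strsep() (assumed here for illustration, as is
handle_one()):

    char *orig, *s, *tok;
    s = orig = kstrdup(input, GFP_KERNEL);
    while ((tok = strsep(&s, ":")))     /* strsep() advances s */
            handle_one(tok);
    kfree(orig);                        /* free what kstrdup() returned */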
index c75fc31915d3936d8c0a26949915534aac482b3a..b414736d59a5b36d1344657eaeb6de6113ec5a09 100644 (file)
@@ -342,14 +342,15 @@ bool bch2_is_zero(const void *, size_t);
 
 u64 bch2_read_flag_list(char *, const char * const[]);
 
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
 
 void bch2_print_string_as_lines(const char *prefix, const char *lines);
 
 typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
 void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
 
 static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 {
index 5a1858fb9879afd1c70c3d5a64883315090d6dbe..9c0d2316031b1beceda4e1b68dcda4e34184a89e 100644 (file)
@@ -590,8 +590,9 @@ err:
        mutex_unlock(&inode->ei_update_lock);
 
        if (value &&
-           (opt_id == Opt_background_compression ||
-            opt_id == Opt_background_target))
+           (opt_id == Opt_background_target ||
+            opt_id == Opt_background_compression ||
+            (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
                bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
 
        return bch2_err_class(ret);
diff --git a/libbcachefs/xattr_format.h b/libbcachefs/xattr_format.h
new file mode 100644 (file)
index 0000000..e9f8105
--- /dev/null
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER              0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS  1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED           3
+#define KEY_TYPE_XATTR_INDEX_SECURITY          4
+
+struct bch_xattr {
+       struct bch_val          v;
+       __u8                    x_type;
+       __u8                    x_name_len;
+       __le16                  x_val_len;
+       __u8                    x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
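bch_xattr is a variable-length on-disk value: the name bytes live in the
x_name[] flexible array and the value bytes follow them directly, delimited
by x_name_len and the little-endian x_val_len. A sketch of how the payload
is addressed under that assumption:

    static inline const void *xattr_val(const struct bch_xattr *x)
    {
            return x->x_name + x->x_name_len;
    }
    /* total size: sizeof(struct bch_xattr) + x_name_len +
     * le16_to_cpu(x->x_val_len), padded to the 8-byte key alignment */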
index 61f23362d016375330c312a854a85f69aac03598..b7f58737298fa7280f1c949312486c363f85ad14 100644 (file)
@@ -162,17 +162,17 @@ sector_t get_capacity(struct gendisk *disk)
        return bytes >> 9;
 }
 
-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
 {
-       fdatasync(bdev->bd_fd);
-       close(bdev->bd_fd);
-       free(bdev);
+       fdatasync(handle->bdev->bd_fd);
+       close(handle->bdev->bd_fd);
+       free(handle->bdev);
+       free(handle);
 }
 
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
-                                       void *holder, const struct blk_holder_ops *hop)
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+                                     void *holder, const struct blk_holder_ops *hop)
 {
-       struct block_device *bdev;
        int fd, flags = 0;
 
        if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
@@ -192,7 +192,7 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
        if (fd < 0)
                return ERR_PTR(-errno);
 
-       bdev = malloc(sizeof(*bdev));
+       struct block_device *bdev = malloc(sizeof(*bdev));
        memset(bdev, 0, sizeof(*bdev));
 
        strncpy(bdev->name, path, sizeof(bdev->name));
@@ -205,12 +205,12 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
        bdev->bd_disk->bdi      = &bdev->bd_disk->__bdi;
        bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
 
-       return bdev;
-}
+       struct bdev_handle *handle = calloc(1, sizeof(*handle));
+       handle->bdev    = bdev;
+       handle->holder  = holder;
+       handle->mode    = mode;
 
-void bdput(struct block_device *bdev)
-{
-       BUG();
+       return handle;
 }
 
 int lookup_bdev(const char *path, dev_t *dev)
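The userspace shim mirrors the kernel API so the shared sources compile
unchanged: bdev_open_by_path() wraps open(2) and hands back a heap-allocated
handle, bdev_release() does fdatasync + close + free, and the now-unused
bdput() stub is dropped. A usage sketch (error handling elided):

    struct bdev_handle *h =
            bdev_open_by_path("/dev/sdb", BLK_OPEN_READ, NULL, NULL);
    if (IS_ERR(h))
            return -PTR_ERR(h);
    /* ... read the superblock through h->bdev ... */
    bdev_release(h);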