Update bcachefs sources to 0906b1fb49 bcachefs: fixes for 32 bit/big endian machines
author    Kent Overstreet <kent.overstreet@gmail.com>
          Thu, 17 May 2018 05:38:57 +0000 (01:38 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Thu, 17 May 2018 06:36:19 +0000 (02:36 -0400)
65 files changed:
.bcachefs_revision
bcachefs.c
cmd_assemble.c
cmd_debug.c
cmd_run.c
cmds.h
include/linux/timer.h
include/linux/workqueue.h
libbcachefs/alloc.c
libbcachefs/alloc.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/checksum.c
libbcachefs/clock.c
libbcachefs/clock.h
libbcachefs/compress.c
libbcachefs/disk_groups.c
libbcachefs/disk_groups.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fsck.c
libbcachefs/io.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/keylist.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/move_types.h [new file with mode: 0644]
libbcachefs/movinggc.c
libbcachefs/rebalance.c [new file with mode: 0644]
libbcachefs/rebalance.h [moved from libbcachefs/tier.h with 65% similarity]
libbcachefs/rebalance_types.h [new file with mode: 0644]
libbcachefs/six.c
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/tier.c [deleted file]
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/xattr.c
linux/sched.c
linux/timer.c
linux/workqueue.c

index 37d51b2f0a98fe914055417a497c8fdceec39540..e267faa6dfd771855a794b0865cce2c1b1571849 100644
@@ -1 +1 @@
-ed4aea2ad4fa1b3891684cbd071d1a1ae9094342
+0906b1fb492e8e84f563b192fd8f458af1c1d420
index 53806f39f15ad42d5bf5ef3c27a0feebdc1cb469..1c56ead736394cb0ea3f1bf9249b69b35bd41539 100644
@@ -36,10 +36,12 @@ static void usage(void)
             "  fsck                 Check an existing filesystem for errors\n"
             "\n"
             "Startup/shutdown, assembly of multi device filesystems:\n"
+#if 0
             "  assemble             Assemble an existing multi device filesystem\n"
             "  incremental          Incrementally assemble an existing multi device filesystem\n"
             "  run                  Start a partially assembled filesystem\n"
             "  stop                   Stop a running filesystem\n"
+#endif
             "\n"
             "Commands for managing a running filesystem:\n"
             "  fs usage             Show disk usage\n"
@@ -150,6 +152,7 @@ int main(int argc, char *argv[])
        if (!strcmp(cmd, "fsck"))
                return cmd_fsck(argc, argv);
 
+#if 0
        if (!strcmp(cmd, "assemble"))
                return cmd_assemble(argc, argv);
        if (!strcmp(cmd, "incremental"))
@@ -158,6 +161,7 @@ int main(int argc, char *argv[])
                return cmd_run(argc, argv);
        if (!strcmp(cmd, "stop"))
                return cmd_stop(argc, argv);
+#endif
 
        if (!strcmp(cmd, "fs"))
                return fs_cmds(argc, argv);
index 57b28026d9aa8710bded6ff46b046cf5e4c665e5..a997e1e1e900f6c00824791b0c321c738558cd86 100644
@@ -11,6 +11,7 @@
 #include "cmds.h"
 #include "libbcachefs.h"
 
+#if 0
 int cmd_assemble(int argc, char *argv[])
 {
        unsigned nr_devs = argc - 1;
@@ -26,7 +27,7 @@ int cmd_assemble(int argc, char *argv[])
 
        unsigned i;
        for (i = 0; i < nr_devs; i++)
-               assemble->devs[i] = (__u64) argv[i + 1];
+               assemble->devs[i] = (unsigned long) argv[i + 1];
 
        xioctl(bcachectl_open(), BCH_IOCTL_ASSEMBLE, assemble);
        return 0;
@@ -38,9 +39,10 @@ int cmd_incremental(int argc, char *argv[])
                die("Please supply exactly one device");
 
        struct bch_ioctl_incremental incremental = {
-               .dev = (__u64) argv[1],
+               .dev = (unsigned long) argv[1],
        };
 
        xioctl(bcachectl_open(), BCH_IOCTL_INCREMENTAL, &incremental);
        return 0;
 }
+#endif
index 6c2b3184c36dbedcdcf95d3634d6a37674018feb..11d73b35b6eba653242448154317c1198ff71d65 100644
@@ -10,6 +10,7 @@
 
 #include "libbcachefs/bcachefs.h"
 #include "libbcachefs/alloc.h"
+#include "libbcachefs/bset.h"
 #include "libbcachefs/btree_cache.h"
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/buckets.h"
index 673d519ab8ebfe5d663d52d9e6e22981dce83d75..1bf84e5cdfd7ec5721175f0e175cff4068bddc97 100644
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -15,6 +15,7 @@
 #include "cmds.h"
 #include "libbcachefs.h"
 
+#if 0
 int cmd_run(int argc, char *argv[])
 {
        return 0;
@@ -29,3 +30,4 @@ int cmd_stop(int argc, char *argv[])
        xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
        return 0;
 }
+#endif
diff --git a/cmds.h b/cmds.h
index 6d21db6f0ad5349acc7f19d7de2a6ede7a971f9b..258a823d04669c7ef4fe13838c347ff5f58daa4d 100644
--- a/cmds.h
+++ b/cmds.h
 int cmd_format(int argc, char *argv[]);
 int cmd_show_super(int argc, char *argv[]);
 
+#if 0
 int cmd_assemble(int argc, char *argv[]);
 int cmd_incremental(int argc, char *argv[]);
 int cmd_run(int argc, char *argv[]);
 int cmd_stop(int argc, char *argv[]);
+#endif
 
 int cmd_fs_usage(int argc, char *argv[]);
 
index 363f26a44f50e9d51a12265d3e7a3e3c511e9ea3..9667acf9d70a7694c40ca4e38537a3db79efd7f5 100644
@@ -6,27 +6,22 @@
 
 struct timer_list {
        unsigned long           expires;
-       void                    (*function)(unsigned long);
-       unsigned long           data;
+       void                    (*function)(struct timer_list *timer);
        bool                    pending;
 };
 
-static inline void init_timer(struct timer_list *timer)
+static inline void timer_setup(struct timer_list *timer,
+                              void (*func)(struct timer_list *),
+                              unsigned int flags)
 {
        memset(timer, 0, sizeof(*timer));
+       timer->function = func;
 }
 
-#define __init_timer(_timer, _flags)   init_timer(_timer)
+#define timer_setup_on_stack(timer, callback, flags)                   \
+       timer_setup(timer, callback, flags)
 
-#define __setup_timer(_timer, _fn, _data, _flags)                      \
-       do {                                                            \
-               __init_timer((_timer), (_flags));                       \
-               (_timer)->function = (_fn);                             \
-               (_timer)->data = (_data);                               \
-       } while (0)
-
-#define setup_timer(timer, fn, data)                                   \
-       __setup_timer((timer), (fn), (data), 0)
+#define destroy_timer_on_stack(timer) do {} while (0)
 
 static inline int timer_pending(const struct timer_list *timer)
 {
@@ -36,8 +31,9 @@ static inline int timer_pending(const struct timer_list *timer)
 int del_timer(struct timer_list * timer);
 int del_timer_sync(struct timer_list *timer);
 
+#define del_singleshot_timer_sync(timer) del_timer_sync(timer)
+
 int mod_timer(struct timer_list *timer, unsigned long expires);
-//extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
 
 static inline void add_timer(struct timer_list *timer)
 {
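
With the new interface the untyped 'data' argument is gone: the callback
receives the timer itself and recovers its enclosing object. A minimal usage
sketch (my_state, my_timer_fn and my_state_init are hypothetical names, not
part of this commit; jiffies is assumed available from the shim headers):

struct my_state {
        struct timer_list       timer;
        int                     ticks;
};

static void my_timer_fn(struct timer_list *t)
{
        /* container_of() replaces the old cast of the 'data' argument */
        struct my_state *s = container_of(t, struct my_state, timer);

        s->ticks++;
}

static void my_state_init(struct my_state *s)
{
        timer_setup(&s->timer, my_timer_fn, 0);
        mod_timer(&s->timer, jiffies + 1);
}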
index 213562f2201fcc66903e06bf0f55e4afb303efc2..1406c9582c859c2627653fae54a7d47bc22480d3 100644
@@ -8,7 +8,7 @@ struct task_struct;
 struct workqueue_struct;
 struct work_struct;
 typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(unsigned long __data);
+void delayed_work_timer_fn(struct timer_list *);
 
 #define work_data_bits(work) ((unsigned long *)(&(work)->data))
 
@@ -44,9 +44,7 @@ struct delayed_work {
 #define INIT_DELAYED_WORK(_work, _func)                                        \
        do {                                                            \
                INIT_WORK(&(_work)->work, (_func));                     \
-               __setup_timer(&(_work)->timer, delayed_work_timer_fn,   \
-                             (unsigned long)(_work),                   \
-                             TIMER_IRQSAFE);                           \
+               timer_setup(&(_work)->timer, delayed_work_timer_fn, 0); \
        } while (0)
 
 static inline struct delayed_work *to_delayed_work(struct work_struct *work)
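
Since delayed_work_timer_fn() now takes the timer itself, it can recover the
containing delayed_work without the old 'data' field. A plausible sketch of
the implementation (the real body lives in linux/workqueue.c and is not part
of this diff):

void delayed_work_timer_fn(struct timer_list *timer)
{
        struct delayed_work *dwork =
                container_of(timer, struct delayed_work, timer);

        /* timer fired: hand the work item to the workqueue */
        queue_work(system_wq, &dwork->work);
}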
index 256adb51510167faca1142a1831611c9b7538cc7..44f9479eecd19d645cd75f09a3882452e7cd2b3f 100644
@@ -1393,12 +1393,10 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
 {
        int i;
 
-       for (i = wp->first_ptr - 1; i >= 0; --i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, wp->ptrs[i]->ptr.dev);
-
-               if (dev_in_target(ca, target) == in_target)
+       for (i = wp->first_ptr - 1; i >= 0; --i)
+               if (bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+                                      target) == in_target)
                        writepoint_drop_ptr(c, wp, i);
-       }
 }
 
 static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
@@ -1555,7 +1553,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        /* does writepoint have ptrs we don't want to use? */
        if (target)
                writepoint_for_each_ptr(wp, ob, i)
-                       if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
+                       if (!bch2_dev_in_target(c, ob->ptr.dev, target)) {
                                swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
                                wp->first_ptr++;
                        }
@@ -1590,7 +1588,8 @@ alloc_done:
                 * one in the target we want:
                 */
                if (cache_idx >= 0) {
-                       if (!dev_in_target(ca, target)) {
+                       if (!bch2_dev_in_target(c, wp->ptrs[i]->ptr.dev,
+                                               target)) {
                                writepoint_drop_ptr(c, wp, i);
                        } else {
                                writepoint_drop_ptr(c, wp, cache_idx);
@@ -1621,7 +1620,7 @@ alloc_done:
 
                        if (ca->mi.durability &&
                            ca->mi.durability <= nr_ptrs_effective - nr_replicas &&
-                           !dev_idx_in_target(c, ob->ptr.dev, target)) {
+                           !bch2_dev_in_target(c, ob->ptr.dev, target)) {
                                swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
                                wp->first_ptr++;
                                nr_ptrs_effective -= ca->mi.durability;
@@ -1890,8 +1889,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 /* stop allocator thread: */
 void bch2_dev_allocator_stop(struct bch_dev *ca)
 {
-       struct task_struct *p = ca->alloc_thread;
+       struct task_struct *p;
 
+       p = rcu_dereference_protected(ca->alloc_thread, 1);
        ca->alloc_thread = NULL;
 
        /*
@@ -1926,7 +1926,7 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
                return PTR_ERR(p);
 
        get_task_struct(p);
-       ca->alloc_thread = p;
+       rcu_assign_pointer(ca->alloc_thread, p);
        wake_up_process(p);
        return 0;
 }
@@ -2099,7 +2099,7 @@ again:
                        if (btree_node_dirty(b) && (!b->written || b->level)) {
                                if (btree_node_may_write(b)) {
                                        rcu_read_unlock();
-                                       six_lock_read(&b->lock);
+                                       btree_node_lock_type(c, b, SIX_LOCK_read);
                                        bch2_btree_node_write(c, b, SIX_LOCK_read);
                                        six_unlock_read(&b->lock);
                                        goto again;
index 372cc047e92749a0f3c2414e07313ee350f48f09..00d01f464c68f3a95c30883f076cb82eeede8aa5 100644
@@ -103,7 +103,8 @@ static inline void bch2_wake_allocator(struct bch_dev *ca)
        struct task_struct *p;
 
        rcu_read_lock();
-       if ((p = READ_ONCE(ca->alloc_thread)))
+       p = rcu_dereference(ca->alloc_thread);
+       if (p)
                wake_up_process(p);
        rcu_read_unlock();
 }
index 206c30f4b51699db1860952e368044b1a119bf95..879bde20bca4ca4a37e65d51d8768b00b3d56838 100644
 #include <linux/zstd.h>
 
 #include "bcachefs_format.h"
-#include "bset.h"
 #include "fifo.h"
 #include "opts.h"
 #include "util.h"
@@ -271,26 +270,38 @@ do {                                                                      \
 #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
 #endif
 
-#define BCH_TIME_STATS()                               \
-       BCH_TIME_STAT(btree_node_mem_alloc)             \
-       BCH_TIME_STAT(btree_gc)                         \
-       BCH_TIME_STAT(btree_split)                      \
-       BCH_TIME_STAT(btree_sort)                       \
-       BCH_TIME_STAT(btree_read)                       \
-       BCH_TIME_STAT(data_write)                       \
-       BCH_TIME_STAT(data_read)                        \
-       BCH_TIME_STAT(data_promote)                     \
-       BCH_TIME_STAT(journal_write)                    \
-       BCH_TIME_STAT(journal_delay)                    \
-       BCH_TIME_STAT(journal_blocked)                  \
-       BCH_TIME_STAT(journal_flush_seq)
+#define BCH_TIME_STATS()                       \
+       x(btree_node_mem_alloc)                 \
+       x(btree_gc)                             \
+       x(btree_split)                          \
+       x(btree_sort)                           \
+       x(btree_read)                           \
+       x(btree_lock_contended_read)            \
+       x(btree_lock_contended_intent)          \
+       x(btree_lock_contended_write)           \
+       x(data_write)                           \
+       x(data_read)                            \
+       x(data_promote)                         \
+       x(journal_write)                        \
+       x(journal_delay)                        \
+       x(journal_blocked)                      \
+       x(journal_flush_seq)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+       BCH_TIME_STATS()
+#undef x
+       BCH_TIME_STAT_NR
+};
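
The x-macro list above can be expanded more than once, which is the point of
the rewrite: for example, a parallel table of human-readable names (a
hypothetical illustration, not part of this diff) stays in sync with the enum
by construction:

static const char * const bch2_time_stat_names[] = {
#define x(name) #name,
        BCH_TIME_STATS()
#undef x
};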
 
 #include "alloc_types.h"
+#include "btree_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
 #include "quota_types.h"
+#include "rebalance_types.h"
 #include "super_types.h"
 
 /*
@@ -372,7 +383,7 @@ struct bch_dev {
        struct bch_dev_usage    usage_cached;
 
        /* Allocator: */
-       struct task_struct      *alloc_thread;
+       struct task_struct __rcu *alloc_thread;
 
        /*
         * free: Buckets that are ready to be used
@@ -447,7 +458,6 @@ enum {
        /* shutdown: */
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
-       BCH_FS_GC_STOPPING,
 
        /* errors: */
        BCH_FS_ERROR,
@@ -570,12 +580,6 @@ struct bch_fs {
        struct delayed_work     pd_controllers_update;
        unsigned                pd_controllers_update_seconds;
 
-       /* REBALANCE */
-       struct task_struct      *rebalance_thread;
-       struct bch_pd_controller rebalance_pd;
-
-       atomic64_t              rebalance_work_unknown_dev;
-
        struct bch_devs_mask    rw_devs[BCH_DATA_NR];
 
        u64                     capacity; /* sectors */
@@ -664,6 +668,9 @@ struct bch_fs {
 
        atomic64_t              key_version;
 
+       /* REBALANCE */
+       struct bch_fs_rebalance rebalance;
+
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
        struct bio_set          dio_write_bioset;
@@ -714,18 +721,13 @@ struct bch_fs {
 
        unsigned                btree_gc_periodic:1;
        unsigned                copy_gc_enabled:1;
-       unsigned                rebalance_enabled:1;
-       unsigned                rebalance_percent;
        bool                    promote_whole_extents;
 
 #define BCH_DEBUG_PARAM(name, description) bool name;
        BCH_DEBUG_PARAMS_ALL()
 #undef BCH_DEBUG_PARAM
 
-#define BCH_TIME_STAT(name)                            \
-       struct time_stats       name##_time;
-       BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+       struct time_stats       times[BCH_TIME_STAT_NR];
 };
 
 static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
index 48d14a30e03e7ed81b1c51a00e59f0c576bc3d3c..ab8b944634e87387648884cd679b37ff374e2fbf 100644
@@ -3,6 +3,72 @@
 
 /*
  * bcachefs on disk data structures
+ *
+ * OVERVIEW:
+ *
+ * There are three main types of on disk data structures in bcachefs (this is
+ * reduced from 5 in bcache)
+ *
+ *  - superblock
+ *  - journal
+ *  - btree
+ *
+ * The btree is the primary structure; most metadata exists as keys in the
+ * various btrees. There are only a small number of btrees, they're not
+ * sharded - we have one btree for extents, another for inodes, et cetera.
+ *
+ * SUPERBLOCK:
+ *
+ * The superblock contains the location of the journal, the list of devices in
+ * the filesystem, and in general any metadata we need in order to decide
+ * whether we can start a filesystem or prior to reading the journal/btree
+ * roots.
+ *
+ * The superblock is extensible, and most of the contents of the superblock are
+ * in variable length, type tagged fields; see struct bch_sb_field.
+ *
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
+ * not have a fixed size. To locate backup superblocks we have struct
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
+ * before the first superblock.
+ *
+ * JOURNAL:
+ *
+ * The journal primarily records btree updates in the order they occurred;
+ * journal replay consists of just iterating over all the keys in the open
+ * journal entries and re-inserting them into the btrees.
+ *
+ * The journal also contains entry types for the btree roots, and blacklisted
+ * journal sequence numbers (see journal_seq_blacklist.c).
+ *
+ * BTREE:
+ *
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
+ * entry in a given node (offset 0), and struct btree_node_entry for all
+ * subsequent writes.
+ *
+ * After the header, btree node entries contain a list of keys in sorted order.
+ * Values are stored inline with the keys; since values are variable length (and
+ * keys effectively are variable length too, due to packing) we can't do random
+ * access without building up additional in memory tables in the btree node read
+ * path.
+ *
+ * BTREE KEYS (struct bkey):
+ *
+ * The various btrees share a common format for the key - so as to avoid
+ * switching in fastpath lookup/comparison code - but define their own
+ * structures for the key values.
+ *
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
+ * size is just under 2k. The common part also contains a type tag for the
+ * value, and a format field indicating whether the key is packed or not (and
+ * also meant to allow adding new key fields in the future, if desired).
+ *
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
+ * be generous with field sizes in the common part of the key format (64 bit
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
  */
 
 #include <asm/types.h>
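
The "just under 2k" bound stated above follows from the size field being a
__u8 counting 64-bit words; an illustrative compile-time restatement (not
part of this header):

/* 255 is the largest value a __u8 can hold; each unit is one u64 */
_Static_assert(255 * sizeof(__u64) == 2040, "max bkey size is just under 2k");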
@@ -44,12 +110,19 @@ struct bkey_format {
 /* Btree keys - all units are in sectors */
 
 struct bpos {
-       /* Word order matches machine byte order */
-#if defined(__LITTLE_ENDIAN)
+       /*
+        * Word order matches machine byte order - btree code treats a bpos as a
+        * single large integer, for search/comparison purposes
+        *
+        * Note that wherever a bpos is embedded in another on disk data
+        * structure, it has to be byte swabbed when reading in metadata that
+        * wasn't written in native endian order:
+        */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        __u32           snapshot;
        __u64           offset;
        __u64           inode;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        __u64           inode;
        __u64           offset;         /* Points to end of extent - sectors */
        __u32           snapshot;
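
Because word order tracks byte order, the btree code can compare two
positions as a single large integer; semantically that is equivalent to the
field-by-field comparison below (a sketch of the semantics only, with a
hypothetical name, not the actual implementation):

static inline int bpos_cmp_sketch(struct bpos l, struct bpos r)
{
        /* inode is most significant, snapshot least */
        if (l.inode != r.inode)
                return l.inode < r.inode ? -1 : 1;
        if (l.offset != r.offset)
                return l.offset < r.offset ? -1 : 1;
        if (l.snapshot != r.snapshot)
                return l.snapshot < r.snapshot ? -1 : 1;
        return 0;
}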
@@ -83,10 +156,10 @@ struct bch_val {
 };
 
 struct bversion {
-#if defined(__LITTLE_ENDIAN)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        __u64           lo;
        __u32           hi;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        __u32           hi;
        __u64           lo;
 #endif
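
The guard changes in these hunks are the core of the 32 bit/big endian fix:
in userspace, glibc's <endian.h> defines both __LITTLE_ENDIAN and
__BIG_ENDIAN as numeric constants on every host, so #if defined(...) tests
always take the little-endian branch. Comparing the compiler-provided
__BYTE_ORDER__ macro is reliable in both kernel and userspace builds. The
pitfall, sketched under the assumption of a glibc userspace build:

#include <endian.h>

#if defined(__BIG_ENDIAN)
/* reached on every host: glibc defines __BIG_ENDIAN as 4321 everywhere */
#endif

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
/* reached only when actually compiling for a big endian target */
#endif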
@@ -110,13 +183,13 @@ struct bkey {
        /* Type of the value */
        __u8            type;
 
-#if defined(__LITTLE_ENDIAN)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        __u8            pad[1];
 
        struct bversion version;
        __u32           size;           /* extent size, in sectors */
        struct bpos     p;
-#elif defined(__BIG_ENDIAN)
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        struct bpos     p;
        __u32           size;           /* extent size, in sectors */
        struct bversion version;
@@ -275,10 +348,10 @@ BKEY_VAL_TYPE(cookie,             KEY_TYPE_COOKIE);
  *
  * If an extent is not checksummed or compressed, when the extent is trimmed we
  * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the start of the data that
- * is currently live. The size field in struct bkey records the current (live)
- * size of the extent, and is also used to mean "size of region on disk that we
- * point to" in this case.
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
  *
  * Thus an extent that is not checksummed or compressed will consist only of a
  * list of bch_extent_ptrs, with none of the fields in
@@ -446,11 +519,11 @@ struct bch_extent_crc128 {
 #elif defined (__BIG_ENDIAN_BITFIELD)
        __u64                   compression_type:4,
                                csum_type:4,
-                               nonce:14,
+                               nonce:13,
                                offset:13,
                                _uncompressed_size:13,
                                _compressed_size:13,
-                               type:3;
+                               type:4;
 #endif
        struct bch_csum         csum;
 } __attribute__((packed, aligned(8)));
@@ -496,7 +569,7 @@ struct bch_extent_reservation {
 };
 
 union bch_extent_entry {
-#if defined(__LITTLE_ENDIAN) ||  __BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
        unsigned long                   type;
 #elif __BITS_PER_LONG == 32
        struct {
@@ -551,10 +624,11 @@ BKEY_VAL_TYPE(reservation,        BCH_RESERVATION);
          sizeof(struct bch_extent_ptr)) / sizeof(u64))
 
 /* Maximum possible size of an entire extent value: */
-/* There's a hack in the keylist code that needs to be fixed.. */
 #define BKEY_EXTENT_VAL_U64s_MAX                               \
        (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
 
+#define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
+
 /* Maximum possible size of an entire extent, key + value: */
 #define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
 
@@ -1378,33 +1452,4 @@ struct btree_node_entry {
        };
 } __attribute__((packed, aligned(8)));
 
-/* Obsolete: */
-
-struct prio_set {
-       struct bch_csum         csum;
-
-       __le64                  magic;
-       __le32                  nonce[3];
-       __le16                  version;
-       __le16                  flags;
-
-       __u8                    encrypted_start[0];
-
-       __le64                  next_bucket;
-
-       struct bucket_disk {
-               __le16          prio[2];
-               __u8            gen;
-       } __attribute__((packed)) data[];
-} __attribute__((packed, aligned(8)));
-
-LE32_BITMASK(PSET_CSUM_TYPE,   struct prio_set, flags, 0, 4);
-
-#define PSET_MAGIC             __cpu_to_le64(0x6750e15f87337f91ULL)
-
-static inline __u64 __pset_magic(struct bch_sb *sb)
-{
-       return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC);
-}
-
 #endif /* _BCACHEFS_FORMAT_H */
index 6578847b50c4cdf25539fcad05afabb470999f21..73e5d887ccd8bfcc6cf87e8b7800566ac0903209 100644
@@ -5,6 +5,9 @@
 #include <asm/ioctl.h>
 #include "bcachefs_format.h"
 
+/*
+ * Flags common to multiple ioctls:
+ */
 #define BCH_FORCE_IF_DATA_LOST         (1 << 0)
 #define BCH_FORCE_IF_METADATA_LOST     (1 << 1)
 #define BCH_FORCE_IF_DATA_DEGRADED     (1 << 2)
 #define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
 
 #define BCH_FORCE_IF_DEGRADED                  \
        (BCH_FORCE_IF_DATA_DEGRADED|            \
         BCH_FORCE_IF_METADATA_DEGRADED)
 
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
 #define BCH_BY_INDEX                   (1 << 4)
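
Hypothetical call sites showing the two addressing modes (struct
bch_ioctl_disk is defined further down in this header):

struct bch_ioctl_disk by_path = {
        .dev    = (unsigned long) "/dev/sda1",  /* BCH_BY_INDEX clear */
};

struct bch_ioctl_disk by_index = {
        .flags  = BCH_BY_INDEX,
        .dev    = 0,                            /* member device index 0 */
};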
 
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
 #define BCH_READ_DEV                   (1 << 5)
 
 /* global control dev: */
 
+/* These are currently broken, and probably unnecessary: */
+#if 0
 #define BCH_IOCTL_ASSEMBLE     _IOW(0xbc, 1, struct bch_ioctl_assemble)
 #define BCH_IOCTL_INCREMENTAL  _IOW(0xbc, 2, struct bch_ioctl_incremental)
 
@@ -35,12 +49,18 @@ struct bch_ioctl_incremental {
        __u64                   pad;
        __u64                   dev;
 };
+#endif
 
 /* filesystem ioctls: */
 
 #define BCH_IOCTL_QUERY_UUID   _IOR(0xbc,      1,  struct bch_ioctl_query_uuid)
+
+/* These only make sense when we also have incremental assembly */
+#if 0
 #define BCH_IOCTL_START                _IOW(0xbc,      2,  struct bch_ioctl_start)
 #define BCH_IOCTL_STOP         _IO(0xbc,       3)
+#endif
+
 #define BCH_IOCTL_DISK_ADD     _IOW(0xbc,      4,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_REMOVE  _IOW(0xbc,      5,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_ONLINE  _IOW(0xbc,      6,  struct bch_ioctl_disk)
@@ -52,14 +72,70 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc,      13,  struct bch_ioctl_disk_get_idx)
 #define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      13,  struct bch_ioctl_disk_resize)
 
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
 struct bch_ioctl_query_uuid {
        uuid_le                 uuid;
 };
 
+#if 0
 struct bch_ioctl_start {
        __u32                   flags;
        __u32                   pad;
 };
+#endif
+
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either offline or online.
+ *
+ * Will fail if removing @dev would leave us with insufficient read write
+ * devices or degraded/unavailable data, unless the appropriate
+ * BCH_FORCE_IF_* flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
 
 struct bch_ioctl_disk {
        __u32                   flags;
@@ -67,6 +143,16 @@ struct bch_ioctl_disk {
        __u64                   dev;
 };
 
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state          - one of the bch_member_state states (rw, ro, failed,
+ *                       spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
 struct bch_ioctl_disk_set_state {
        __u32                   flags;
        __u8                    new_state;
@@ -81,6 +167,15 @@ enum bch_data_ops {
        BCH_DATA_OP_NR          = 3,
 };
 
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
 struct bch_ioctl_data {
        __u32                   op;
        __u32                   flags;
@@ -93,9 +188,18 @@ struct bch_ioctl_data {
                __u32           dev;
                __u32           pad;
        }                       migrate;
+       struct {
+               __u64           pad[8];
+       };
        };
 } __attribute__((packed, aligned(8)));
 
+enum bch_data_event {
+       BCH_DATA_EVENT_PROGRESS = 0,
+       /* XXX: add an event for reporting errors */
+       BCH_DATA_EVENT_NR       = 1,
+};
+
 struct bch_ioctl_data_progress {
        __u8                    data_type;
        __u8                    btree_id;
@@ -106,6 +210,15 @@ struct bch_ioctl_data_progress {
        __u64                   sectors_total;
 } __attribute__((packed, aligned(8)));
 
+struct bch_ioctl_data_event {
+       __u8                    type;
+       __u8                    pad[7];
+       union {
+       struct bch_ioctl_data_progress p;
+       __u64                   pad2[15];
+       };
+} __attribute__((packed, aligned(8)));
+
 struct bch_ioctl_dev_usage {
        __u8                    state;
        __u8                    alive;
@@ -127,6 +240,19 @@ struct bch_ioctl_fs_usage {
        __u64                   sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
 };
 
+/*
+ * BCH_IOCTL_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @nr_devices - number of devices userspace allocated space for in @devs
+ *
+ * On success, @fs and @devs will be filled out appropriately and devs[i].alive
+ * will indicate if a device was present in that slot
+ *
+ * Returns -ERANGE if @nr_devices was too small
+ */
 struct bch_ioctl_usage {
        __u16                   nr_devices;
        __u16                   pad[3];
@@ -135,6 +261,20 @@ struct bch_ioctl_usage {
        struct bch_ioctl_dev_usage devs[0];
 };
 
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb         - buffer to read into
+ * @size       - size of userspace allocated buffer
+ * @dev                - device to read superblock for, if BCH_READ_DEV flag is
+ *               specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
 struct bch_ioctl_read_super {
        __u32                   flags;
        __u32                   pad;
@@ -143,10 +283,22 @@ struct bch_ioctl_read_super {
        __u64                   sb;
 };
 
+/*
+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
+ * determine if disk is a (online) member - if so, returns device's index
+ *
+ * Returns -ENOENT if not found
+ */
 struct bch_ioctl_disk_get_idx {
        __u64                   dev;
 };
 
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev                - member to resize
+ * @nbuckets   - new number of buckets
+ */
 struct bch_ioctl_disk_resize {
        __u32                   flags;
        __u32                   pad;
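
Tying the BCH_IOCTL_DATA documentation above together, a userspace sketch of
starting a job and polling its progress fd (error handling trimmed;
BCH_DATA_OP_REREPLICATE and the progress struct's sectors_done field are
elided from this excerpt and assumed from the full header):

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int rereplicate_and_watch(int fs_fd)
{
        struct bch_ioctl_data cmd = {
                .op = BCH_DATA_OP_REREPLICATE,
        };
        struct bch_ioctl_data_event e;
        int job_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd);

        if (job_fd < 0)
                return -1;

        while (read(job_fd, &e, sizeof(e)) == sizeof(e) &&
               e.type == BCH_DATA_EVENT_PROGRESS)
                printf("%llu/%llu sectors\n",
                       (unsigned long long) e.p.sectors_done,
                       (unsigned long long) e.p.sectors_total);

        close(job_fd);  /* closing the fd stops the job */
        return 0;
}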
index f665e2e138bf314a136e115125253cf14492a902..2d6c8a230a735c84e05dff7598f6c93bac601f8a 100644
@@ -13,8 +13,6 @@
 
 void bch2_to_binary(char *, const u64 *, unsigned);
 
-#define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
 /* bkey with split value, const */
 struct bkey_s_c {
        const struct bkey       *k;
@@ -590,25 +588,31 @@ BKEY_VAL_ACCESSORS(quota,         BCH_QUOTA);
 
 /* byte order helpers */
 
-#if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN)
-#error edit for your odd byteorder.
-#endif
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 
-#ifdef __LITTLE_ENDIAN
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+       return f->key_u64s - 1;
+}
 
 #define high_bit_offset                0
-#define __high_word(u64s, k)   ((k)->_data + (u64s) - 1)
 #define nth_word(p, n)         ((p) - (n))
 
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+static inline unsigned high_word_offset(const struct bkey_format *f)
+{
+       return 0;
+}
 
 #define high_bit_offset                KEY_PACKED_BITS_START
-#define __high_word(u64s, k)   ((k)->_data)
 #define nth_word(p, n)         ((p) + (n))
 
+#else
+#error edit for your odd byteorder.
 #endif
 
-#define high_word(format, k)   __high_word((format)->key_u64s, k)
+#define high_word(f, k)                ((k)->_data + high_word_offset(f))
 #define next_word(p)           nth_word(p, 1)
 #define prev_word(p)           nth_word(p, -1)
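
These helpers let packed-key code walk a key's words from most to least
significant regardless of host byte order; for example (a hypothetical
helper, not part of this commit):

static bool bkey_packed_nonzero(const struct bkey_format *f,
                                const struct bkey_packed *k)
{
        const u64 *w = high_word(f, k);
        unsigned i;

        for (i = 0; i < f->key_u64s; i++, w = next_word(w))
                if (*w)
                        return true;
        return false;
}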
 
index 92046ae4915c758c72ea42839dce83b11b8b66d5..9a27477409bad42ff5c70e214819f11c9f896d38 100644
@@ -6,6 +6,7 @@
  */
 
 #include "bcachefs.h"
+#include "btree_cache.h"
 #include "bset.h"
 #include "eytzinger.h"
 #include "util.h"
@@ -438,6 +439,10 @@ void bch2_btree_keys_free(struct btree *b)
        b->aux_data = NULL;
 }
 
+#ifndef PAGE_KERNEL_EXEC
+# define PAGE_KERNEL_EXEC PAGE_KERNEL
+#endif
+
 int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
 {
        b->page_order   = page_order;
@@ -672,7 +677,7 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
         * (and then the bits we want are at the high end, so we shift them
         * back down):
         */
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        v >>= f->exponent & 7;
 #else
        v >>= 64 - (f->exponent & 7) - (idx < BFLOAT_32BIT_NR ? 32 : 16);
@@ -761,7 +766,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
         * Then we calculate the actual shift value, from the start of the key
         * (k->_data), to get the key bits starting at exponent:
         */
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
 
        EBUG_ON(shift + bits > b->format.key_u64s * 64);
@@ -964,10 +969,14 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
        set_btree_bset(b, t, i);
 }
 
-void bch2_bset_init_next(struct btree *b, struct bset *i)
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
+                        struct btree_node_entry *bne)
 {
+       struct bset *i = &bne->keys;
        struct bset_tree *t;
 
+       BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+       BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
        BUG_ON(b->nsets >= MAX_BSETS);
 
        memset(i, 0, sizeof(*i));
index cc4ea5d87e4b3e5a51fbc6bb53d0165886cae4bd..153e2b3f787f8263cfeb3cbe7cd9f2e77fd60f80 100644
@@ -157,9 +157,6 @@ static inline bool btree_keys_expensive_checks(const struct btree *b)
 #endif
 }
 
-struct btree_node_iter;
-struct btree_node_iter_set;
-
 enum bset_aux_tree_type {
        BSET_NO_AUX_TREE,
        BSET_RO_AUX_TREE,
@@ -342,7 +339,8 @@ int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
 void bch2_btree_keys_init(struct btree *, bool *);
 
 void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct btree *, struct bset *);
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
+                        struct btree_node_entry *);
 void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
 void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
                                  struct bkey_packed *);
@@ -420,14 +418,6 @@ static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
 
 /* Btree key iteration */
 
-struct btree_node_iter {
-       u8              is_extents;
-
-       struct btree_node_iter_set {
-               u16     k, end;
-       } data[MAX_BSETS];
-};
-
 static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
                                              bool is_extents)
 {
index 469f856562839dbe4f21c6c9763610832bd7a77f..c950f2564f25d9e907a564fdf07843b1f3b880a6 100644
@@ -554,7 +554,8 @@ out:
        b->uncompacted_whiteout_u64s = 0;
        bch2_btree_keys_init(b, &c->expensive_debug_checks);
 
-       bch2_time_stats_update(&c->btree_node_mem_alloc_time, start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
+                              start_time);
 
        return b;
 err:
index ad51f29c9a38450f0aac4d917e63978e39b8aaee..cd5ebfbe7bdae3afc4866a56c0f54c15d23cdd96 100644
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/preempt.h>
 #include <linux/rcupdate.h>
+#include <linux/sched/task.h>
 #include <trace/events/bcachefs.h>
 
 struct range_checks {
@@ -264,10 +265,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
 
                gc_pos_set(c, gc_pos_btree_node(b));
 
-               if (max_stale > 32)
+               if (max_stale > 64)
                        bch2_btree_node_rewrite(c, &iter,
                                        b->data->keys.seq,
                                        BTREE_INSERT_USE_RESERVE|
+                                       BTREE_INSERT_NOWAIT|
                                        BTREE_INSERT_GC_LOCK_HELD);
                else if (!btree_gc_rewrite_disabled(c) &&
                         (btree_gc_always_rewrite(c) || max_stale > 16))
@@ -557,7 +559,7 @@ void bch2_gc(struct bch_fs *c)
 out:
        up_write(&c->gc_lock);
        trace_gc_end(c);
-       bch2_time_stats_update(&c->btree_gc_time, start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
        /*
         * Wake up allocator in case it was waiting for buckets
@@ -813,6 +815,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
 {
        struct btree_iter iter;
        struct btree *b;
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
        unsigned i;
 
        /* Sliding window of adjacent btree nodes */
@@ -859,7 +862,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
 
                lock_seq[0] = merge[0]->lock.state.seq;
 
-               if (test_bit(BCH_FS_GC_STOPPING, &c->flags)) {
+               if (kthread && kthread_should_stop()) {
                        bch2_btree_iter_unlock(&iter);
                        return -ESHUTDOWN;
                }
@@ -958,13 +961,15 @@ static int bch2_gc_thread(void *arg)
 
 void bch2_gc_thread_stop(struct bch_fs *c)
 {
-       set_bit(BCH_FS_GC_STOPPING, &c->flags);
-
-       if (c->gc_thread)
-               kthread_stop(c->gc_thread);
+       struct task_struct *p;
 
+       p = c->gc_thread;
        c->gc_thread = NULL;
-       clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+
+       if (p) {
+               kthread_stop(p);
+               put_task_struct(p);
+       }
 }
 
 int bch2_gc_thread_start(struct bch_fs *c)
@@ -973,12 +978,13 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
        BUG_ON(c->gc_thread);
 
-       p = kthread_create(bch2_gc_thread, c, "bcache_gc");
+       p = kthread_create(bch2_gc_thread, c, "bch_gc");
        if (IS_ERR(p))
                return PTR_ERR(p);
 
+       get_task_struct(p);
        c->gc_thread = p;
-       wake_up_process(c->gc_thread);
+       wake_up_process(p);
        return 0;
 }
 
index 1aa942290e836ef3ccdcfb7db61706a4eb72a4f7..74ffad4c38f3b6b8db912e6bffb059d082de5002 100644
@@ -627,7 +627,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
 
        if (sorting_entire_node)
-               bch2_time_stats_update(&c->btree_sort_time, start_time);
+               bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+                                      start_time);
 
        /* Make sure we preserve bset journal_seq: */
        for (t = b->set + start_idx; t < b->set + end_idx; t++)
@@ -801,7 +802,7 @@ void bch2_btree_sort_into(struct bch_fs *c,
                                &dst->format,
                                true);
 
-       bch2_time_stats_update(&c->btree_sort_time, start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
 
        set_btree_bset_end(dst, dst->set);
 
@@ -877,7 +878,7 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
 
        bne = want_new_bset(c, b);
        if (bne)
-               bch2_bset_init_next(b, &bne->keys);
+               bch2_bset_init_next(c, b, bne);
 
        bch2_btree_build_aux_trees(b);
 
@@ -1382,7 +1383,7 @@ start:
                }
        }
 
-       bch2_time_stats_update(&c->btree_read_time, rb->start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
        bio_put(&rb->bio);
        clear_btree_node_read_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@@ -1742,6 +1743,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        BUG_ON((b->will_make_reachable != 0) != !b->written);
 
        BUG_ON(b->written >= c->opts.btree_node_size);
+       BUG_ON(b->written & (c->opts.block_size - 1));
        BUG_ON(bset_written(b, btree_bset_last(b)));
        BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
        BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
@@ -1972,7 +1974,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
 
        bne = want_new_bset(c, b);
        if (bne)
-               bch2_bset_init_next(b, &bne->keys);
+               bch2_bset_init_next(c, b, bne);
 
        bch2_btree_build_aux_trees(b);
 
index 947685f925b1c881580441b07adb2e6ee48cfbd6..fa154642515128bae9d2dd9645fbf347572f9b72 100644
@@ -133,7 +133,7 @@ do {                                                                        \
                                                                        \
                six_unlock_read(&(_b)->lock);                           \
                btree_node_wait_on_io(_b);                              \
-               six_lock_read(&(_b)->lock);                             \
+               btree_node_lock_type(c, b, SIX_LOCK_read);              \
        }                                                               \
 } while (0)
 
index 69cad3bb16061d0cddd93164e176fe7caec0f532..70c3132eb538a67f26bc16dfb087339a87b99009 100644
@@ -42,37 +42,28 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
        six_unlock_write(&b->lock);
 }
 
-void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
 {
+       struct bch_fs *c = iter->c;
        struct btree_iter *linked;
        unsigned readers = 0;
 
-       EBUG_ON(iter->l[b->level].b != b);
-       EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
-
-       if (six_trylock_write(&b->lock))
-               return;
-
        for_each_linked_btree_iter(iter, linked)
                if (linked->l[b->level].b == b &&
                    btree_node_read_locked(linked, b->level))
                        readers++;
 
-       if (likely(!readers)) {
-               six_lock_write(&b->lock);
-       } else {
-               /*
-                * Must drop our read locks before calling six_lock_write() -
-                * six_unlock() won't do wakeups until the reader count
-                * goes to 0, and it's safe because we have the node intent
-                * locked:
-                */
-               atomic64_sub(__SIX_VAL(read_lock, readers),
-                            &b->lock.state.counter);
-               six_lock_write(&b->lock);
-               atomic64_add(__SIX_VAL(read_lock, readers),
-                            &b->lock.state.counter);
-       }
+       /*
+        * Must drop our read locks before calling six_lock_write() -
+        * six_unlock() won't do wakeups until the reader count
+        * goes to 0, and it's safe because we have the node intent
+        * locked:
+        */
+       atomic64_sub(__SIX_VAL(read_lock, readers),
+                    &b->lock.state.counter);
+       btree_node_lock_type(c, b, SIX_LOCK_write);
+       atomic64_add(__SIX_VAL(read_lock, readers),
+                    &b->lock.state.counter);
 }
 
 bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
@@ -135,6 +126,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                           struct btree_iter *iter,
                           enum six_lock_type type)
 {
+       struct bch_fs *c = iter->c;
        struct btree_iter *linked;
 
        /* Can't have children locked before ancestors: */
@@ -206,7 +198,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                }
        }
 
-       six_lock_type(&b->lock, type);
+       __btree_node_lock_type(c, b, type);
        return true;
 }
 
index 95191ba2bc79145274c96cf42bc0105da05c1cfc..0097a2a20a18f119bdd092b03ca555977c89370d 100644
@@ -4,72 +4,6 @@
 #include <linux/dynamic_fault.h>
 
 #include "btree_types.h"
-#include "bset.h"
-
-#define BTREE_ITER_SLOTS               (1 << 0)
-#define BTREE_ITER_INTENT              (1 << 1)
-#define BTREE_ITER_PREFETCH            (1 << 2)
-/*
- * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
- * @pos or the first key strictly greater than @pos
- */
-#define BTREE_ITER_IS_EXTENTS          (1 << 3)
-/*
- * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
- */
-#define BTREE_ITER_AT_END_OF_LEAF      (1 << 4)
-#define BTREE_ITER_ERROR               (1 << 5)
-
-enum btree_iter_uptodate {
-       BTREE_ITER_UPTODATE             = 0,
-       BTREE_ITER_NEED_PEEK            = 1,
-       BTREE_ITER_NEED_RELOCK          = 2,
-       BTREE_ITER_NEED_TRAVERSE        = 3,
-       BTREE_ITER_END                  = 4,
-};
-
-/*
- * @pos                        - iterator's current position
- * @level              - current btree depth
- * @locks_want         - btree level below which we start taking intent locks
- * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked        - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
-       struct bch_fs           *c;
-       struct bpos             pos;
-
-       u8                      flags;
-       unsigned                uptodate:4;
-       enum btree_id           btree_id:4;
-       unsigned                level:4,
-                               locks_want:4,
-                               nodes_locked:4,
-                               nodes_intent_locked:4;
-
-       struct btree_iter_level {
-               struct btree    *b;
-               struct btree_node_iter iter;
-       }                       l[BTREE_MAX_DEPTH];
-
-       u32                     lock_seq[BTREE_MAX_DEPTH];
-
-       /*
-        * Current unpacked key - so that bch2_btree_iter_next()/
-        * bch2_btree_iter_next_slot() can correctly advance pos.
-        */
-       struct bkey             k;
-
-       /*
-        * Circular linked list of linked iterators: linked iterators share
-        * locks (e.g. two linked iterators may have the same node intent
-        * locked, or read and write locked, at the same time), and insertions
-        * through one iterator won't invalidate the other linked iterators.
-        */
-
-       /* Must come last: */
-       struct btree_iter       *next;
-};
 
 static inline void btree_iter_set_dirty(struct btree_iter *iter,
                                        enum btree_iter_uptodate u)
index 0581f44a103e4f0a658bef18f5467eb7022f8651..f48084bc26aec8d9d82f04c18909fb54693af378 100644
@@ -98,6 +98,39 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
        mark_btree_node_unlocked(iter, level);
 }
 
+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
+{
+       switch (type) {
+       case SIX_LOCK_read:
+               return BCH_TIME_btree_lock_contended_read;
+       case SIX_LOCK_intent:
+               return BCH_TIME_btree_lock_contended_intent;
+       case SIX_LOCK_write:
+               return BCH_TIME_btree_lock_contended_write;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * wrapper around six locks that just traces lock contended time
+ */
+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
+                                         enum six_lock_type type)
+{
+       u64 start_time = local_clock();
+
+       six_lock_type(&b->lock, type);
+       bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+}
+
+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
+                                       enum six_lock_type type)
+{
+       if (!six_trylock_type(&b->lock, type))
+               __btree_node_lock_type(c, b, type);
+}
+
 bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
                           struct btree_iter *, enum six_lock_type);
 
@@ -125,7 +158,17 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
 bool bch2_btree_iter_relock(struct btree_iter *);
 
 void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
-void bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+
+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+{
+       EBUG_ON(iter->l[b->level].b != b);
+       EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
+
+       if (!six_trylock_write(&b->lock))
+               __bch2_btree_node_lock_write(b, iter);
+}
 
 #endif /* _BCACHEFS_BTREE_LOCKING_H */
 
index 8854305d315e8c34c859a9354f6429b32df56dc3..f62c96d96226a651ab578549aec3cbe21293e82c 100644
@@ -176,6 +176,79 @@ struct btree_cache {
        struct closure_waitlist alloc_wait;
 };
 
+struct btree_node_iter {
+       u8              is_extents;
+
+       struct btree_node_iter_set {
+               u16     k, end;
+       } data[MAX_BSETS];
+};
+
+#define BTREE_ITER_SLOTS               (1 << 0)
+#define BTREE_ITER_INTENT              (1 << 1)
+#define BTREE_ITER_PREFETCH            (1 << 2)
+/*
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
+ * @pos or the first key strictly greater than @pos
+ */
+#define BTREE_ITER_IS_EXTENTS          (1 << 3)
+/*
+ * indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
+ */
+#define BTREE_ITER_AT_END_OF_LEAF      (1 << 4)
+#define BTREE_ITER_ERROR               (1 << 5)
+
+enum btree_iter_uptodate {
+       BTREE_ITER_UPTODATE             = 0,
+       BTREE_ITER_NEED_PEEK            = 1,
+       BTREE_ITER_NEED_RELOCK          = 2,
+       BTREE_ITER_NEED_TRAVERSE        = 3,
+       BTREE_ITER_END                  = 4,
+};
+
+/*
+ * @pos                        - iterator's current position
+ * @level              - current btree depth
+ * @locks_want         - btree level below which we start taking intent locks
+ * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked        - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+       struct bch_fs           *c;
+       struct bpos             pos;
+
+       u8                      flags;
+       unsigned                uptodate:4;
+       enum btree_id           btree_id:4;
+       unsigned                level:4,
+                               locks_want:4,
+                               nodes_locked:4,
+                               nodes_intent_locked:4;
+
+       struct btree_iter_level {
+               struct btree    *b;
+               struct btree_node_iter iter;
+       }                       l[BTREE_MAX_DEPTH];
+
+       u32                     lock_seq[BTREE_MAX_DEPTH];
+
+       /*
+        * Current unpacked key - so that bch2_btree_iter_next()/
+        * bch2_btree_iter_next_slot() can correctly advance pos.
+        */
+       struct bkey             k;
+
+       /*
+        * Circular linked list of linked iterators: linked iterators share
+        * locks (e.g. two linked iterators may have the same node intent
+        * locked, or read and write locked, at the same time), and insertions
+        * through one iterator won't invalidate the other linked iterators.
+        */
+
+       /* Must come last: */
+       struct btree_iter       *next;
+};
+
 #define BTREE_FLAG(flag)                                               \
 static inline bool btree_node_ ## flag(struct btree *b)                        \
 {      return test_bit(BTREE_NODE_ ## flag, &b->flags); }              \
index adba309204eddb87c43af528dfd50cfbcdc369cf..c3ecc1e96726db874cd5503351f825334ba71d1a 100644
@@ -237,7 +237,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
 
        clear_btree_node_noevict(b);
 
-       six_lock_write(&b->lock);
+       btree_node_lock_type(c, b, SIX_LOCK_write);
 
        bch2_btree_node_hash_remove(&c->btree_cache, b);
 
@@ -622,7 +622,7 @@ static void btree_update_nodes_reachable(struct closure *cl)
                 * b->will_make_reachable prevented it from being written, so
                 * write it now if it needs to be written:
                 */
-               six_lock_read(&b->lock);
+               btree_node_lock_type(c, b, SIX_LOCK_read);
                bch2_btree_node_write_cond(c, b, btree_node_need_write(b));
                six_unlock_read(&b->lock);
                mutex_lock(&c->btree_interior_update_lock);
@@ -647,8 +647,10 @@ static void btree_update_wait_on_journal(struct closure *cl)
        ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
        if (ret < 0)
                goto err;
-       if (!ret)
+       if (!ret) {
                continue_at(cl, btree_update_wait_on_journal, system_wq);
+               return;
+       }
 
        bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
 err:
@@ -679,7 +681,7 @@ retry:
 
                if (!six_trylock_read(&b->lock)) {
                        mutex_unlock(&c->btree_interior_update_lock);
-                       six_lock_read(&b->lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
                        six_unlock_read(&b->lock);
                        goto retry;
                }
@@ -720,7 +722,7 @@ retry:
 
                if (!six_trylock_read(&b->lock)) {
                        mutex_unlock(&c->btree_interior_update_lock);
-                       six_lock_read(&b->lock);
+                       btree_node_lock_type(c, b, SIX_LOCK_read);
                        six_unlock_read(&b->lock);
                        goto retry;
                }
@@ -1456,7 +1458,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
                bch2_btree_iter_node_replace(iter, n2);
        bch2_btree_iter_node_replace(iter, n1);
 
-       bch2_time_stats_update(&c->btree_split_time, start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_split], start_time);
 }
 
 static void
@@ -1795,8 +1797,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
        bch2_btree_node_write(c, n, SIX_LOCK_intent);
 
        if (parent) {
-               bch2_btree_insert_node(as, parent, iter,
-                                      &keylist_single(&n->key));
+               bch2_keylist_add(&as->parent_keys, &n->key);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
        } else {
                bch2_btree_set_root(as, n, iter);
        }
index 3e66d69eda1bee57ebc9fdebc28e4c34f373ad3e..25bfc7ab9ee01937b92b9e56e5fa45f496df6ae4 100644 (file)
@@ -226,11 +226,30 @@ static inline bool bset_unwritten(struct btree *b, struct bset *i)
        return (void *) i > write_block(b);
 }
 
-static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
-                                      struct bset *i)
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
+                                                struct btree *b,
+                                                void *end)
 {
-       return round_up(bset_byte_offset(b, vstruct_end(i)),
-                       block_bytes(c)) >> 9;
+       ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
+               b->whiteout_u64s +
+               b->uncompacted_whiteout_u64s;
+       ssize_t total = c->opts.btree_node_size << 6;
+
+       return total - used;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+                                                  struct btree *b)
+{
+       ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+                               btree_bkey_last(b, bset_tree_last(b)));
+
+       BUG_ON(remaining < 0);
+
+       if (bset_written(b, btree_bset_last(b)))
+               return 0;
+
+       return remaining;
 }
 
 static inline unsigned btree_write_set_buffer(struct btree *b)
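
The total above relies on btree_node_size being in 512-byte sectors: shifting left by 6 multiplies by 64, the number of u64 slots per sector (512 / 8). A quick standalone check of that arithmetic, assuming a 256KiB node:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void)
{
        unsigned btree_node_size = 512; /* in 512-byte sectors: a 256KiB node */
        size_t total_u64s = (size_t) btree_node_size << 6;

        /* sectors << 6 == sectors * 512 bytes / 8 bytes per u64 */
        assert(total_u64s == (size_t) btree_node_size * 512 / sizeof(uint64_t));
        assert(total_u64s == 32768);
        return 0;
}
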
@@ -246,20 +265,19 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
                                                     struct btree *b)
 {
        struct bset *i = btree_bset_last(b);
-       unsigned offset = max_t(unsigned, b->written << 9,
-                               bset_byte_offset(b, vstruct_end(i)));
-       ssize_t remaining_space = (ssize_t) btree_bytes(c) - (ssize_t)
-               (offset + sizeof(struct btree_node_entry) +
-                b->whiteout_u64s * sizeof(u64) +
-                b->uncompacted_whiteout_u64s * sizeof(u64));
-
-       EBUG_ON(offset > btree_bytes(c));
-
-       if ((unlikely(bset_written(b, i)) &&
-            remaining_space > block_bytes(c)) ||
-           (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
-            remaining_space > btree_write_set_buffer(b)))
-               return (void *) b->data + offset;
+       struct btree_node_entry *bne = max(write_block(b),
+                       (void *) btree_bkey_last(b, bset_tree_last(b)));
+       ssize_t remaining_space =
+               __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+
+       if (unlikely(bset_written(b, i))) {
+               if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+                       return bne;
+       } else {
+               if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+                   remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
+                       return bne;
+       }
 
        return NULL;
 }
@@ -285,23 +303,6 @@ static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
        }
 }
 
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
-                                                  struct btree *b)
-{
-       struct bset *i = btree_bset_last(b);
-       unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
-               b->whiteout_u64s +
-               b->uncompacted_whiteout_u64s;
-       unsigned total = c->opts.btree_node_size << 6;
-
-       EBUG_ON(used > total);
-
-       if (bset_written(b, i))
-               return 0;
-
-       return total - used;
-}
-
 /*
  * write lock must be held on @b (else the dirty bset that we were going to
  * insert into could be written out from under us)
index 92fb5f61ff14acf2b859576f86a4320b76d4e2d6..cc41140fbe3a59214ab0841e2d512c232c7b28d8 100644 (file)
@@ -108,7 +108,7 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
 
-       six_lock_read(&b->lock);
+       btree_node_lock_type(c, b, SIX_LOCK_read);
        bch2_btree_node_write_cond(c, b,
                        (btree_current_write(b) == w &&
                         w->journal.pin_list == journal_seq_pin(j, seq)));
index 1f944cb8a3e5df9d6725cebb7fd3aa722bb639d2..5dda22c73d43eac278d983acd4db8cfe0cdc23a5 100644 (file)
@@ -555,9 +555,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
                return;
        }
 
-       v = READ_ONCE(g->_mark.counter);
+       v = atomic64_read(&g->_mark.v);
        do {
-               new.counter = old.counter = v;
+               new.v.counter = old.v.counter = v;
                saturated = 0;
 
                /*
@@ -600,9 +600,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
                        g->_mark = new;
                        break;
                }
-       } while ((v = cmpxchg(&g->_mark.counter,
-                             old.counter,
-                             new.counter)) != old.counter);
+       } while ((v = atomic64_cmpxchg(&g->_mark.v,
+                             old.v.counter,
+                             new.v.counter)) != old.v.counter);
 
        bch2_dev_usage_update(c, ca, old, new);
 
@@ -957,7 +957,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
        kvpfree(ca->buckets_dirty,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
-       kvpfree(ca->buckets,     sizeof(struct bucket_array) +
+       kvpfree(rcu_dereference_protected(ca->buckets, 1),
+               sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
 
        free_percpu(ca->usage_percpu);
index 01f0b314c32243acf75f9fd27212ef0c4ab69975..aefe602744b71e7158d96d007755e0105338d81e 100644 (file)
 
 #define bucket_cmpxchg(g, new, expr)                           \
 ({                                                             \
-       u64 _v = READ_ONCE((g)->_mark.counter);                 \
+       u64 _v = atomic64_read(&(g)->_mark.v);                  \
        struct bucket_mark _old;                                \
                                                                \
        do {                                                    \
-               (new).counter = _old.counter = _v;              \
+               (new).v.counter = _old.v.counter = _v;          \
                expr;                                           \
-       } while ((_v = cmpxchg(&(g)->_mark.counter,             \
-                              _old.counter,                    \
-                              (new).counter)) != _old.counter);\
+       } while ((_v = atomic64_cmpxchg(&(g)->_mark.v,          \
+                              _old.v.counter,                  \
+                              (new).v.counter)) != _old.v.counter);\
        _old;                                                   \
 })
 
index 28bd2c59647796015ac31771e0a7d2b49ddd522c..10f00861385e98ae1bb3184a251843c44b01c0e9 100644 (file)
@@ -6,7 +6,7 @@
 struct bucket_mark {
        union {
        struct {
-               u64             counter;
+               atomic64_t      v;
        };
 
        struct {
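
Turning the plain u64 counter into an embedded atomic64_t is what makes this work on 32-bit machines: the atomic64 ops guarantee a 64-bit compare-and-exchange that a bare cmpxchg() on a u64 does not provide there. The retry-loop shape itself is unchanged; here it is in portable C11 atomics (a userspace sketch, not the kernel helpers):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

/* Same shape as bucket_cmpxchg: snapshot the packed 64-bit mark, modify a
 * local copy, retry until no other thread raced with the update. */
static _Atomic uint64_t mark;

static void add_sectors(uint64_t sectors)
{
        uint64_t old = atomic_load(&mark), new;

        do {
                new = old + sectors;    /* "expr" applied to the local copy */
        } while (!atomic_compare_exchange_weak(&mark, &old, new));
}

static void *worker(void *arg)
{
        for (int i = 0; i < 100000; i++)
                add_sectors(1);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, NULL);
        pthread_create(&b, NULL, worker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        assert(atomic_load(&mark) == 200000);   /* no lost updates */
        return 0;
}
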
index 8403bae64038102bff65ba38ea77f645d98975f9..5593b9a1de27cc7a737cb46a88fd7b204d5f4da9 100644 (file)
@@ -54,6 +54,7 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
        return ca;
 }
 
+#if 0
 static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
 {
        struct bch_ioctl_assemble arg;
@@ -127,14 +128,17 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
 
        return 0;
 }
+#endif
 
 static long bch2_global_ioctl(unsigned cmd, void __user *arg)
 {
        switch (cmd) {
+#if 0
        case BCH_IOCTL_ASSEMBLE:
                return bch2_ioctl_assemble(arg);
        case BCH_IOCTL_INCREMENTAL:
                return bch2_ioctl_incremental(arg);
+#endif
        default:
                return -ENOTTY;
        }
@@ -148,6 +152,7 @@ static long bch2_ioctl_query_uuid(struct bch_fs *c,
                            sizeof(c->sb.user_uuid));
 }
 
+#if 0
 static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
 {
        if (arg.flags || arg.pad)
@@ -161,6 +166,7 @@ static long bch2_ioctl_stop(struct bch_fs *c)
        bch2_fs_stop(c);
        return 0;
 }
+#endif
 
 static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
 {
@@ -294,18 +300,19 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
 {
        struct bch_data_ctx *ctx = file->private_data;
        struct bch_fs *c = ctx->c;
-       struct bch_ioctl_data_progress p = {
-               .data_type      = ctx->stats.data_type,
-               .btree_id       = ctx->stats.iter.btree_id,
-               .pos            = ctx->stats.iter.pos,
-               .sectors_done   = atomic64_read(&ctx->stats.sectors_seen),
-               .sectors_total  = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+       struct bch_ioctl_data_event e = {
+               .type                   = BCH_DATA_EVENT_PROGRESS,
+               .p.data_type            = ctx->stats.data_type,
+               .p.btree_id             = ctx->stats.iter.btree_id,
+               .p.pos                  = ctx->stats.iter.pos,
+               .p.sectors_done         = atomic64_read(&ctx->stats.sectors_seen),
+               .p.sectors_total        = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
        };
 
-       if (len != sizeof(p))
+       if (len < sizeof(e))
                return -EINVAL;
 
-       return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
+       return copy_to_user(buf, &e, sizeof(e)) ? -EFAULT : sizeof(e);
 }
 
 static const struct file_operations bcachefs_data_ops = {
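
With the bare progress struct replaced by a typed bch_ioctl_data_event, userspace reads one whole event and dispatches on its type; rejecting len < sizeof(e) rather than len != sizeof(p) leaves room for larger reads later. A hypothetical consumer sketch, assuming the event struct and BCH_DATA_EVENT_PROGRESS from the bcachefs uapi header are in scope and the fd came from the data-job ioctl:

#include <stdio.h>
#include <unistd.h>

static int poll_progress(int data_fd)
{
        struct bch_ioctl_data_event e;  /* from the bcachefs uapi header */

        if (read(data_fd, &e, sizeof(e)) != sizeof(e))
                return -1;

        if (e.type == BCH_DATA_EVENT_PROGRESS)
                printf("%llu/%llu sectors\n",
                       (unsigned long long) e.p.sectors_done,
                       (unsigned long long) e.p.sectors_total);
        return 0;
}
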
@@ -419,7 +426,7 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 
                if (ca->dev_idx >= arg.nr_devices) {
                        percpu_ref_put(&ca->ref);
-                       return -ENOSPC;
+                       return -ERANGE;
                }
 
                if (percpu_ref_tryget(&ca->io_ref)) {
@@ -539,10 +546,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
                return -EPERM;
 
        switch (cmd) {
+#if 0
        case BCH_IOCTL_START:
                BCH_IOCTL(start, struct bch_ioctl_start);
        case BCH_IOCTL_STOP:
                return bch2_ioctl_stop(c);
+#endif
        case BCH_IOCTL_READ_SUPER:
                BCH_IOCTL(read_super, struct bch_ioctl_read_super);
        case BCH_IOCTL_DISK_GET_IDX:
index 6d8543eb65008e7a67b7f91455d73759654c4b8c..28d086bc0e6124379503078ebd105ba121586244 100644 (file)
@@ -421,7 +421,7 @@ static struct bch_csum bch2_checksum_merge(unsigned type,
        BUG_ON(!bch2_checksum_mergeable(type));
 
        while (b_len) {
-               unsigned b = min(b_len, PAGE_SIZE);
+               unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
 
                a.lo = bch2_checksum_update(type, a.lo,
                                page_address(ZERO_PAGE(0)), b);
index 650be8cebe8fea8ee8f3ac1e9747d0ee47ab8172..c67376f96f5ae635d63840654f4254db6226d5a6 100644 (file)
@@ -42,7 +42,8 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
 }
 
 struct io_clock_wait {
-       struct io_timer         timer;
+       struct io_timer         io_timer;
+       struct timer_list       cpu_timer;
        struct task_struct      *task;
        int                     expired;
 };
@@ -50,7 +51,16 @@ struct io_clock_wait {
 static void io_clock_wait_fn(struct io_timer *timer)
 {
        struct io_clock_wait *wait = container_of(timer,
-                               struct io_clock_wait, timer);
+                               struct io_clock_wait, io_timer);
+
+       wait->expired = 1;
+       wake_up_process(wait->task);
+}
+
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+       struct io_clock_wait *wait = container_of(timer,
+                               struct io_clock_wait, cpu_timer);
 
        wait->expired = 1;
        wake_up_process(wait->task);
@@ -61,35 +71,38 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
        struct io_clock_wait wait;
 
        /* XXX: calculate sleep time rigorously */
-       wait.timer.expire       = until;
-       wait.timer.fn           = io_clock_wait_fn;
+       wait.io_timer.expire    = until;
+       wait.io_timer.fn        = io_clock_wait_fn;
        wait.task               = current;
        wait.expired            = 0;
-       bch2_io_timer_add(clock, &wait.timer);
+       bch2_io_timer_add(clock, &wait.io_timer);
 
        schedule();
 
-       bch2_io_timer_del(clock, &wait.timer);
+       bch2_io_timer_del(clock, &wait.io_timer);
 }
 
-/*
- * _only_ to be used from a kthread
- */
 void bch2_kthread_io_clock_wait(struct io_clock *clock,
-                              unsigned long until)
+                               unsigned long io_until,
+                               unsigned long cpu_timeout)
 {
+       bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct io_clock_wait wait;
 
-       /* XXX: calculate sleep time rigorously */
-       wait.timer.expire       = until;
-       wait.timer.fn           = io_clock_wait_fn;
+       wait.io_timer.expire    = io_until;
+       wait.io_timer.fn        = io_clock_wait_fn;
        wait.task               = current;
        wait.expired            = 0;
-       bch2_io_timer_add(clock, &wait.timer);
+       bch2_io_timer_add(clock, &wait.io_timer);
+
+       timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+       if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+               mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
 
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
-               if (kthread_should_stop())
+               if (kthread && kthread_should_stop())
                        break;
 
                if (wait.expired)
@@ -100,7 +113,9 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
        }
 
        __set_current_state(TASK_RUNNING);
-       bch2_io_timer_del(clock, &wait.timer);
+       del_singleshot_timer_sync(&wait.cpu_timer);
+       destroy_timer_on_stack(&wait.cpu_timer);
+       bch2_io_timer_del(clock, &wait.io_timer);
 }
 
 static struct io_timer *get_expired_timer(struct io_clock *clock,
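
bch2_kthread_io_clock_wait() now arms two timers at once: the io_timer fires once enough IO has passed on the filesystem's IO clock, the on-stack cpu_timer after enough wall time, and whichever fires first sets expired and wakes the task. The same wait-for-either-source shape in portable userspace C (a condition variable standing in for the two wakeup paths; not the kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool expired;

/* Sleep until the "io clock" stand-in signals us, or until timeout_ms of
 * wall time passes, whichever comes first. */
static void wait_io_or_timeout(unsigned timeout_ms)
{
        struct timespec ts;

        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += timeout_ms / 1000;
        ts.tv_nsec += (long) (timeout_ms % 1000) * 1000000;
        if (ts.tv_nsec >= 1000000000L) {
                ts.tv_sec++;
                ts.tv_nsec -= 1000000000L;
        }

        pthread_mutex_lock(&lock);
        while (!expired &&
               pthread_cond_timedwait(&cond, &lock, &ts) == 0)
                ;       /* spurious wakeup: re-check, like the kthread loop */
        pthread_mutex_unlock(&lock);
}

static void *io_event(void *arg)
{
        usleep(10000);                  /* the "io clock" fires after 10ms */
        pthread_mutex_lock(&lock);
        expired = true;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, io_event, NULL);
        wait_io_or_timeout(1000);       /* returns after ~10ms, not 1s */
        pthread_join(t, NULL);
        return 0;
}
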
index af6b2b392cac257e38e2f2e19a68f5da29f2a760..1e2a7dea4ddd0a0fe5c7c35695c502aa662ecb4b 100644 (file)
@@ -3,7 +3,8 @@
 
 void bch2_io_timer_add(struct io_clock *, struct io_timer *);
 void bch2_io_timer_del(struct io_clock *, struct io_timer *);
-void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+                               unsigned long);
 void bch2_increment_clock(struct bch_fs *, unsigned, int);
 
 void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
index 1af62621da1b070d71af7ce3a6b09ec6eff48b8b..6379905bad7b4ee341e9fea7244a2c94b28dc8f7 100644 (file)
@@ -480,7 +480,7 @@ static const unsigned bch2_compression_opt_to_feature[] = {
 
 #undef BCH_FEATURE_NONE
 
-int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
 {
        int ret = 0;
 
@@ -529,26 +529,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
        mempool_exit(&c->compression_bounce[READ]);
 }
 
-static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
-{
-       size_t size = (size_t)pool_data;
-       return kvpmalloc(size, gfp_mask);
-}
-
-void mempool_kvpfree(void *element, void *pool_data)
-{
-       size_t size = (size_t)pool_data;
-       kvpfree(element, size);
-}
-
-static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-       return !mempool_initialized(pool)
-               ? mempool_init(pool, min_nr, mempool_kvpmalloc,
-                              mempool_kvpfree, (void *) size)
-               : 0;
-}
-
 static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
        size_t max_extent = c->sb.encoded_extent_max << 9;
@@ -611,6 +591,9 @@ have_compressed:
                if (i->decompress_workspace)
                        decompress_workspace_needed = true;
 
+               if (mempool_initialized(&c->compress_workspace[i->type]))
+                       continue;
+
                ret = mempool_init_kvpmalloc_pool(
                                &c->compress_workspace[i->type],
                                1, i->compress_workspace);
index c129a33eb7590cb1909144344099c57501df655c..cd200cbed3e6ec9fef93b2092b1409a8c3e823d1 100644 (file)
@@ -16,8 +16,8 @@ static int group_cmp(const void *_l, const void *_r)
                strncmp(l->label, r->label, sizeof(l->label));
 }
 
-const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
-                                        struct bch_sb_field *f)
+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+                                               struct bch_sb_field *f)
 {
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
@@ -162,7 +162,8 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
                }
        }
 
-       old_g = c->disk_groups;
+       old_g = rcu_dereference_protected(c->disk_groups,
+                               lockdep_is_held(&c->sb_lock));
        rcu_assign_pointer(c->disk_groups, cpu_g);
        if (old_g)
                kfree_rcu(old_g, rcu);
@@ -193,6 +194,36 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
        }
 }
 
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return false;
+       case TARGET_DEV:
+               return dev == t.dev;
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g;
+               const struct bch_devs_mask *m;
+               bool ret;
+
+               rcu_read_lock();
+               g = rcu_dereference(c->disk_groups);
+               m = t.group < g->nr && !g->entries[t.group].deleted
+                       ? &g->entries[t.group].devs
+                       : NULL;
+
+               ret = m ? test_bit(dev, m->d) : false;
+               rcu_read_unlock();
+
+               return ret;
+       }
+       default:
+               BUG();
+       }
+}
+
 static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
                                  unsigned parent,
                                  const char *name, unsigned namelen)
index 9da9805af91c2931aadc7b4e5f08805a23bf38d8..e92c0dc50970b50df1966b0afd0f11d31becb880 100644 (file)
@@ -53,34 +53,8 @@ static inline struct target target_decode(unsigned target)
        return (struct target) { .type = TARGET_NULL };
 }
 
-static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
-{
-       struct target t = target_decode(target);
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return false;
-       case TARGET_DEV:
-               return ca->dev_idx == t.dev;
-       case TARGET_GROUP:
-               return ca->mi.group && ca->mi.group - 1 == t.group;
-       default:
-               BUG();
-       }
-}
-
-static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
-       bool ret;
-
-       rcu_read_lock();
-       ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
-       rcu_read_unlock();
-
-       return ret;
-}
-
 const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
 
 int bch2_disk_path_find(struct bch_sb_handle *, const char *);
 int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
index 9efaa1ffa92f9402085c03571977407611618946..b85af711b9f92ed9aec3bcb51490ae5a87ab61ea 100644 (file)
@@ -144,7 +144,7 @@ bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group
        const struct bch_extent_ptr *ptr;
 
        extent_for_each_ptr(e, ptr) {
-               struct bch_dev *ca = c->devs[ptr->dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
                if (ca->mi.group &&
                    ca->mi.group - 1 == group)
@@ -159,13 +159,11 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
 {
        const struct bch_extent_ptr *ptr;
 
-       extent_for_each_ptr(e, ptr) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
-               if (dev_in_target(ca, target) &&
-                   (!ptr->cached || !ptr_stale(ca, ptr)))
+       extent_for_each_ptr(e, ptr)
+               if (bch2_dev_in_target(c, ptr->dev, target) &&
+                   (!ptr->cached ||
+                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
                        return ptr;
-       }
 
        return NULL;
 }
@@ -732,7 +730,7 @@ err:
        bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
                      "gen %i mark %08x",
                      err, buf, PTR_BUCKET_NR(ca, ptr),
-                     mark.gen, (unsigned) mark.counter);
+                     mark.gen, (unsigned) mark.v.counter);
 }
 
 void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@@ -2024,7 +2022,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                        int n = bch2_extent_ptr_durability(c, ptr);
 
                        if (n && n <= extra &&
-                           !dev_in_target(c->devs[ptr->dev], target)) {
+                           !bch2_dev_in_target(c, ptr->dev, target)) {
                                ptr->cached = true;
                                extra -= n;
                        }
index 338e9e01cf5d9981b94c1fa2aa17cd242df7acce..08ad9647240616749830c1fae7a7f19812086523 100644 (file)
@@ -278,24 +278,35 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
                        .uncompressed_size      = k->size,
                        .live_size              = k->size,
                };
-       case BCH_EXTENT_CRC32:
-               return (struct bch_extent_crc_unpacked) {
+       case BCH_EXTENT_CRC32: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc32),
-                       .csum.lo                = (__force __le64) crc->crc32.csum,
                };
-       case BCH_EXTENT_CRC64:
-               return (struct bch_extent_crc_unpacked) {
+
+               *((__le32 *) &ret.csum.lo) = crc->crc32.csum;
+
+               return ret;
+       }
+       case BCH_EXTENT_CRC64: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc64),
                        .nonce                  = crc->crc64.nonce,
                        .csum.lo                = (__force __le64) crc->crc64.csum_lo,
-                       .csum.hi                = (__force __le64) crc->crc64.csum_hi,
                };
-       case BCH_EXTENT_CRC128:
-               return (struct bch_extent_crc_unpacked) {
+
+               *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
+
+               return ret;
+       }
+       case BCH_EXTENT_CRC128: {
+               struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
                        common_fields(crc->crc128),
                        .nonce                  = crc->crc128.nonce,
                        .csum                   = crc->crc128.csum,
                };
+
+               return ret;
+       }
        default:
                BUG();
        }
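
The rewritten crc32/crc64 cases are a big-endian fix: the on-disk checksum is a run of little-endian bytes that must occupy the low-order bytes of the 64-bit csum field, so the unpack now stores through a narrow pointer instead of integer-widening, which produces that byte placement only on little-endian hosts. A standalone illustration of the difference:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        uint64_t via_bytes = 0, via_widen = 0;
        uint32_t csum = 0x11223344;

        memcpy(&via_bytes, &csum, sizeof(csum));        /* 4-byte store, like the fix */
        via_widen = csum;                               /* the old widening cast */

        unsigned char a[8], b[8];
        memcpy(a, &via_bytes, 8);
        memcpy(b, &via_widen, 8);

        /* Little-endian: both place the checksum in bytes 0-3.
         * Big-endian: the widened store leaves bytes 0-3 zero. */
        printf("byte store: %02x %02x %02x %02x\n", a[0], a[1], a[2], a[3]);
        printf("widened:    %02x %02x %02x %02x\n", b[0], b[1], b[2], b[3]);
        return 0;
}
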
index a2455b42d9c0bec38083ecece0ee6d1be60f4e5b..1d9464af1db0dc2ba193f475aa4de77fc79fd5ea 100644 (file)
@@ -678,7 +678,7 @@ static void bch2_clear_page_bits(struct page *page)
        if (!PagePrivate(page))
                return;
 
-       s = xchg(page_state(page), (struct bch_page_state) { .v = 0 });
+       s.v = xchg(&page_state(page)->v, 0);
        ClearPagePrivate(page);
 
        if (s.dirty_sectors)
@@ -1020,12 +1020,12 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
 
                        if (bkey_extent_is_data(k.k)) {
                                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-                               const struct bch_extent_ptr *ptr;
                                struct bch_extent_crc_unpacked crc;
+                               const union bch_extent_entry *i;
 
-                               extent_for_each_ptr_crc(e, ptr, crc)
-                                       want_full_extent |= !!crc.csum_type |
-                                                            !!crc.compression_type;
+                               extent_for_each_crc(e, crc, i)
+                                       want_full_extent |= ((crc.csum_type != 0) |
+                                                            (crc.compression_type != 0));
                        }
 
                        readpage_bio_extend(readpages_iter,
@@ -1850,8 +1850,7 @@ err_wait_io:
                dio->loop = true;
 
                if (!dio->sync) {
-                       continue_at_noreturn(&dio->cl,
-                                       bch2_dio_write_loop_async, NULL);
+                       continue_at(&dio->cl, bch2_dio_write_loop_async, NULL);
                        return -EIOCBQUEUED;
                }
 
index 2991a0dd3830bee2df923f9aabb56c1b460d3cca..c554a987f3aa0be228d655587a9ebba33d52005f 100644 (file)
@@ -610,9 +610,10 @@ static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
 static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
 {
        if (nr >= b->size) {
-               size_t new_size = max(max(PAGE_SIZE * 8,
-                                         b->size * 2),
-                                         nr + 1);
+               size_t new_size = max_t(size_t, max_t(size_t,
+                                       PAGE_SIZE * 8,
+                                       b->size * 2),
+                                       nr + 1);
                void *n;
 
                new_size = roundup_pow_of_two(new_size);
@@ -642,7 +643,7 @@ struct pathbuf {
 static int path_down(struct pathbuf *p, u64 inum)
 {
        if (p->nr == p->size) {
-               size_t new_size = max(256UL, p->size * 2);
+               size_t new_size = max_t(size_t, 256UL, p->size * 2);
                void *n = krealloc(p->entries,
                                   new_size * sizeof(p->entries[0]),
                                   GFP_KERNEL);
index bb6565226846625b63a51de6e5ecc659ea34664d..3762fb92b04f9ee7541eccca4ff752d229a5839c 100644 (file)
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "rebalance.h"
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
-#include "tier.h"
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
@@ -269,7 +269,7 @@ static void bch2_write_done(struct closure *cl)
        percpu_ref_put(&c->writes);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
-       bch2_time_stats_update(&c->data_write_time, op->start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 
        closure_return(cl);
 }
@@ -842,20 +842,24 @@ again:
        } while (ret);
 
        continue_at(cl, bch2_write_index, index_update_wq(op));
+       return;
 err:
        op->error = ret;
 
        continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
                    ? bch2_write_index
                    : bch2_write_done, index_update_wq(op));
+       return;
 flush_io:
        closure_sync(cl);
 
        if (!bch2_keylist_empty(&op->insert_keys)) {
                __bch2_write_index(op);
 
-               if (op->error)
+               if (op->error) {
                        continue_at_nobarrier(cl, bch2_write_done, NULL);
+                       return;
+               }
        }
 
        goto again;
@@ -901,6 +905,7 @@ void bch2_write(struct closure *cl)
                if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
                        bch2_disk_reservation_put(c, &op->res);
                closure_return(cl);
+               return;
        }
 
        bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE);
@@ -974,7 +979,8 @@ static void promote_done(struct closure *cl)
                container_of(cl, struct promote_op, cl);
        struct bch_fs *c = op->write.op.c;
 
-       bch2_time_stats_update(&c->data_promote_time, op->start_time);
+       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+                              op->start_time);
 
        bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
        promote_free(c, op);
@@ -1048,7 +1054,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9;
                bch2_bio_map(&(*rbio)->bio, NULL);
 
-               if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
+               if (bch2_bio_alloc_pages(&(*rbio)->bio, GFP_NOIO))
                        goto err;
 
                (*rbio)->bounce         = true;
@@ -1174,7 +1180,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 
 static void bch2_rbio_done(struct bch_read_bio *rbio)
 {
-       bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time);
+       bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+                              rbio->start_time);
        bio_endio(&rbio->bio);
 }
 
@@ -1486,7 +1493,7 @@ csum_err:
        }
 
        bch2_dev_io_error(ca,
-               "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
+               "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
                rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
                rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
                csum.hi, csum.lo, crc.csum_type);
index 4cec7bb56948143e0d06ff366ebef69d07d67a2a..6759810b19ef5dedc9ddb5fa64d153cc6bcafa77 100644 (file)
@@ -365,6 +365,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 ssize_t bch2_journal_print_debug(struct journal *, char *);
 ssize_t bch2_journal_print_pins(struct journal *, char *);
 
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+                               unsigned nr);
 int bch2_dev_journal_alloc(struct bch_dev *);
 
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
index 2fd0d6468e3d1979bcdf3ee06edab509eda76a59..36ba6a4daf84097953cf5f84983bd45dab97bfb0 100644 (file)
@@ -324,7 +324,7 @@ struct jset_entry_ops {
                        struct jset_entry *, int);
 };
 
-const struct jset_entry_ops bch2_jset_entry_ops[] = {
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 #define x(f, nr)                                               \
        [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
                .validate       = journal_entry_validate_##f,   \
@@ -696,6 +696,7 @@ out:
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
+       return;
 err:
        mutex_lock(&jlist->lock);
        jlist->ret = ret;
@@ -716,19 +717,6 @@ void bch2_journal_entries_free(struct list_head *list)
        }
 }
 
-static inline bool journal_has_keys(struct list_head *list)
-{
-       struct journal_replay *i;
-       struct jset_entry *entry;
-       struct bkey_i *k, *_n;
-
-       list_for_each_entry(i, list, list)
-               for_each_jset_key(k, _n, entry, &i->j)
-                       return true;
-
-       return false;
-}
-
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
        struct journal *j = &c->journal;
@@ -737,8 +725,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        struct journal_entry_pin_list *p;
        struct bch_dev *ca;
        u64 cur_seq, end_seq, seq;
-       unsigned iter, keys = 0, entries = 0;
-       size_t nr;
+       unsigned iter;
+       size_t entries = 0;
+       u64 nr, keys = 0;
        bool degraded = false;
        int ret = 0;
 
@@ -772,9 +761,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                return BCH_FSCK_REPAIR_IMPOSSIBLE;
        }
 
-       fsck_err_on(c->sb.clean && journal_has_keys(list), c,
-                   "filesystem marked clean but journal has keys to replay");
-
        list_for_each_entry(i, list, list) {
                ret = jset_validate_entries(c, &i->j, READ);
                if (ret)
@@ -797,15 +783,27 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                }
        }
 
+       list_for_each_entry(i, list, list) {
+               struct jset_entry *entry;
+               struct bkey_i *k, *_n;
+
+               for_each_jset_key(k, _n, entry, &i->j)
+                       keys++;
+       }
+
        i = list_last_entry(list, struct journal_replay, list);
 
        nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
 
+       fsck_err_on(c->sb.clean && (keys || nr > 1), c,
+                   "filesystem marked clean but journal not empty (%llu keys in %llu entries)",
+                   keys, nr);
+
        if (nr > j->pin.size) {
                free_fifo(&j->pin);
                init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
                if (!j->pin.data) {
-                       bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
+                       bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
                        return -ENOMEM;
                }
        }
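
Widening nr and keys to u64 is another of the 32-bit fixes: journal sequence numbers are 64-bit, and on a 32-bit machine assigning their difference to a size_t silently truncates it. A standalone demonstration:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t seq = 0x100000005ULL, last_seq = 1;

        uint64_t nr64 = seq - last_seq + 1;
        uint32_t nr32 = (uint32_t) (seq - last_seq + 1);        /* 32-bit size_t */

        printf("u64: %" PRIu64 ", truncated: %" PRIu32 "\n", nr64, nr32);
        assert(nr64 == 0x100000005ULL && nr32 == 5);
        return 0;
}
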
@@ -844,8 +842,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                                struct journal_replay, list)->j.seq);
 
        list_for_each_entry(i, list, list) {
-               struct jset_entry *entry;
-               struct bkey_i *k, *_n;
                bool blacklisted;
 
                mutex_lock(&j->blacklist_lock);
@@ -867,13 +863,10 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                        journal_last_seq(j), end_seq);
 
                cur_seq = le64_to_cpu(i->j.seq) + 1;
-
-               for_each_jset_key(k, _n, entry, &i->j)
-                       keys++;
                entries++;
        }
 
-       bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
+       bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
                 keys, entries, journal_cur_seq(j));
 fsck_err:
        return ret;
@@ -1361,6 +1354,7 @@ void bch2_journal_write(struct closure *cl)
                bch_err(c, "Unable to allocate journal write");
                bch2_fatal_error(c);
                continue_at(cl, journal_write_done, system_highpri_wq);
+               return;
        }
 
        /*
@@ -1417,6 +1411,7 @@ no_io:
                ptr->offset += sectors;
 
        continue_at(cl, journal_write_done, system_highpri_wq);
+       return;
 err:
        bch2_inconsistent_error(c);
        continue_at(cl, journal_write_done, system_highpri_wq);
index b5301d96469ee27b978368a853880938a61bebbc..567289e22ca0b73f7f92b02521b9191d381a6128 100644 (file)
@@ -247,7 +247,7 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
        if (!bl->nr_entries ||
            is_power_of_2(bl->nr_entries)) {
                n = krealloc(bl->entries,
-                            max(bl->nr_entries * 2, 8UL) * sizeof(*n),
+                            max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
                             GFP_KERNEL);
                if (!n) {
                        ret = -ENOMEM;
index a8c8883ba0719ae736bcb17370de00d7d18322df..3106759e35f7e96c6391cff63d7ea7a16d9d9381 100644 (file)
@@ -55,9 +55,6 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
             _k != (_keylist)->top;                             \
             _k = bkey_next(_k))
 
-#define keylist_single(k)                                      \
-       ((struct keylist) { .keys = k, .top = bkey_next(k) })
-
 static inline u64 keylist_sectors(struct keylist *keys)
 {
        struct bkey_i *k;
index 0431fb81078feff85d196b451aefb0973ddf6794..3e52b7a26c7f51e5e3c37ec73da41885fa93fcb9 100644 (file)
@@ -306,16 +306,16 @@ static void move_write(struct closure *cl)
 {
        struct moving_io *io = container_of(cl, struct moving_io, cl);
 
-       if (likely(!io->rbio.bio.bi_status &&
-                  !io->rbio.hole)) {
-               bch2_migrate_read_done(&io->write, &io->rbio);
-
-               atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
-               closure_call(&io->write.op.cl, bch2_write, NULL, cl);
-               continue_at(cl, move_write_done, NULL);
+       if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+               closure_return_with_destructor(cl, move_free);
+               return;
        }
 
-       closure_return_with_destructor(cl, move_free);
+       bch2_migrate_read_done(&io->write, &io->rbio);
+
+       atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
+       closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+       continue_at(cl, move_write_done, NULL);
 }
 
 static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@@ -411,7 +411,7 @@ static int bch2_move_extent(struct bch_fs *c,
        io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
 
        bch2_bio_map(&io->write.op.wbio.bio, NULL);
-       if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+       if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
                goto err_free;
 
        io->rbio.opts = io_opts;
index bc98f94bb23d8b267974f864b6a80d64411e2c73..bc87e0670d92edbfeb25c8585c0467ef179ab5ae 100644 (file)
@@ -4,6 +4,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "io_types.h"
+#include "move_types.h"
 
 struct bch_read_bio;
 struct moving_context;
@@ -48,16 +49,6 @@ typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
                                enum bkey_type, struct bkey_s_c_extent,
                                struct bch_io_opts *, struct data_opts *);
 
-struct bch_move_stats {
-       enum bch_data_type      data_type;
-       struct btree_iter       iter;
-
-       atomic64_t              keys_moved;
-       atomic64_t              sectors_moved;
-       atomic64_t              sectors_seen;
-       atomic64_t              sectors_raced;
-};
-
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
                   struct write_point_specifier,
                   struct bpos, struct bpos,
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
new file mode 100644 (file)
index 0000000..832542a
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _BCACHEFS_MOVE_TYPES_H
+#define _BCACHEFS_MOVE_TYPES_H
+
+struct bch_move_stats {
+       enum bch_data_type      data_type;
+       struct btree_iter       iter;
+
+       atomic64_t              keys_moved;
+       atomic64_t              sectors_moved;
+       atomic64_t              sectors_seen;
+       atomic64_t              sectors_raced;
+};
+
+#endif /* _BCACHEFS_MOVE_TYPES_H */
index 28dabca74565e3046ad396f03ffbf70ff48e4d4c..7bef456110f1e1132ce812e2a15e3c8cc8940e85 100644 (file)
@@ -241,7 +241,8 @@ static int bch2_copygc_thread(void *arg)
                        ca->mi.bucket_size;
                if (available > reserve) {
                        next = last + available - reserve;
-                       bch2_kthread_io_clock_wait(clock, next);
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
                        continue;
                }
 
@@ -252,7 +253,8 @@ static int bch2_copygc_thread(void *arg)
                fragmented = usage.sectors_fragmented;
                if (fragmented < reserve) {
                        next = last + reserve - fragmented;
-                       bch2_kthread_io_clock_wait(clock, next);
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
                        continue;
                }
 
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
new file mode 100644 (file)
index 0000000..4154b1e
--- /dev/null
@@ -0,0 +1,341 @@
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "clock.h"
+#include "disk_groups.h"
+#include "extents.h"
+#include "io.h"
+#include "move.h"
+#include "rebalance.h"
+#include "super-io.h"
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/sched/cputime.h>
+#include <trace/events/bcachefs.h>
+
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+                                     const struct bch_extent_ptr *ptr,
+                                     struct bch_extent_crc_unpacked crc,
+                                     struct bch_io_opts *io_opts)
+{
+       if (io_opts->background_target &&
+           !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
+           !ptr->cached)
+               return true;
+
+       if (io_opts->background_compression &&
+           crc.compression_type !=
+           bch2_compression_opt_to_type[io_opts->background_compression])
+               return true;
+
+       return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+                           struct bkey_s_c k,
+                           struct bch_io_opts *io_opts)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       struct bkey_s_c_extent e;
+
+       if (!bkey_extent_is_data(k.k))
+               return;
+
+       if (!io_opts->background_target &&
+           !io_opts->background_compression)
+               return;
+
+       e = bkey_s_c_to_extent(k);
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                       if (atomic64_add_return(crc.compressed_size,
+                                               &ca->rebalance_work) ==
+                           crc.compressed_size)
+                               rebalance_wakeup(c);
+               }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+       if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+           sectors)
+               rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+                                   enum bkey_type type,
+                                   struct bkey_s_c_extent e,
+                                   struct bch_io_opts *io_opts,
+                                   struct data_opts *data_opts)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+
+       /* Make sure we have room to add a new pointer: */
+       if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
+           BKEY_EXTENT_VAL_U64s_MAX)
+               return DATA_SKIP;
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+                       goto found;
+
+       return DATA_SKIP;
+found:
+       data_opts->target               = io_opts->background_target;
+       data_opts->btree_insert_flags   = 0;
+       return DATA_ADD_REPLICAS;
+}
+
+struct rebalance_work {
+       int             dev_most_full_idx;
+       unsigned        dev_most_full_percent;
+       u64             dev_most_full_work;
+       u64             dev_most_full_capacity;
+       u64             total_work;
+};
+
+static void rebalance_work_accumulate(struct rebalance_work *w,
+               u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+{
+       unsigned percent_full;
+       u64 work = dev_work + unknown_dev;
+
+       if (work < dev_work || work < unknown_dev)
+               work = U64_MAX;
+       work = min(work, capacity);
+
+       percent_full = div_u64(work * 100, capacity);
+
+       if (percent_full >= w->dev_most_full_percent) {
+               w->dev_most_full_idx            = idx;
+               w->dev_most_full_percent        = percent_full;
+               w->dev_most_full_work           = work;
+               w->dev_most_full_capacity       = capacity;
+       }
+
+       if (w->total_work + dev_work >= w->total_work &&
+           w->total_work + dev_work >= dev_work)
+               w->total_work += dev_work;
+}
+
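
Both checks above are the same unsigned saturation idiom: a wrapped sum comes out smaller than either operand, so only add when the result did not shrink. Distilled into a standalone helper:

#include <assert.h>
#include <stdint.h>

static uint64_t sat_add_u64(uint64_t a, uint64_t b)
{
        /* a + b wrapped iff the sum is smaller than an operand */
        return a + b >= a ? a + b : UINT64_MAX;
}

int main(void)
{
        assert(sat_add_u64(1, 2) == 3);
        assert(sat_add_u64(UINT64_MAX - 1, 5) == UINT64_MAX);
        return 0;
}
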
+static struct rebalance_work rebalance_work(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       struct rebalance_work ret = { .dev_most_full_idx = -1 };
+       u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               rebalance_work_accumulate(&ret,
+                       atomic64_read(&ca->rebalance_work),
+                       unknown_dev,
+                       bucket_to_sector(ca, ca->mi.nbuckets -
+                                        ca->mi.first_bucket),
+                       i);
+
+       rebalance_work_accumulate(&ret,
+               unknown_dev, 0, c->capacity, -1);
+
+       return ret;
+}
+
+static void rebalance_work_reset(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               atomic64_set(&ca->rebalance_work, 0);
+
+       atomic64_set(&c->rebalance.work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+       u64 utime, stime;
+
+       task_cputime_adjusted(current, &utime, &stime);
+       return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+       struct bch_fs *c = arg;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct io_clock *clock = &c->io_clock[WRITE];
+       struct rebalance_work w, p;
+       unsigned long start, prev_start;
+       unsigned long prev_run_time, prev_run_cputime;
+       unsigned long cputime, prev_cputime;
+       unsigned long io_start;
+       long throttle;
+
+       set_freezable();
+
+       io_start        = atomic_long_read(&clock->now);
+       p               = rebalance_work(c);
+       prev_start      = jiffies;
+       prev_cputime    = curr_cputime();
+
+       while (!kthread_wait_freezable(r->enabled)) {
+               start                   = jiffies;
+               cputime                 = curr_cputime();
+
+               prev_run_time           = start - prev_start;
+               prev_run_cputime        = cputime - prev_cputime;
+
+               w                       = rebalance_work(c);
+               BUG_ON(!w.dev_most_full_capacity);
+
+               if (!w.total_work) {
+                       r->state = REBALANCE_WAITING;
+                       kthread_wait_freezable(rebalance_work(c).total_work);
+                       continue;
+               }
+
+               /*
+                * If there isn't much work to do, throttle cpu usage:
+                */
+               throttle = prev_run_cputime * 100 /
+                       max(1U, w.dev_most_full_percent) -
+                       prev_run_time;
+
+               if (w.dev_most_full_percent < 20 && throttle > 0) {
+                       r->state = REBALANCE_THROTTLED;
+                       r->throttled_until_iotime = io_start +
+                               div_u64(w.dev_most_full_capacity *
+                                       (20 - w.dev_most_full_percent),
+                                       50);
+                       r->throttled_until_cputime = start + throttle;
+
+                       bch2_kthread_io_clock_wait(clock,
+                               r->throttled_until_iotime,
+                               throttle);
+                       continue;
+               }
+
+               /* minimum 1 mb/sec: */
+               r->pd.rate.rate =
+                       max_t(u64, 1 << 11,
+                             r->pd.rate.rate *
+                             max(p.dev_most_full_percent, 1U) /
+                             max(w.dev_most_full_percent, 1U));
+
+               io_start        = atomic_long_read(&clock->now);
+               p               = w;
+               prev_start      = start;
+               prev_cputime    = cputime;
+
+               r->state = REBALANCE_RUNNING;
+               memset(&r->move_stats, 0, sizeof(r->move_stats));
+               rebalance_work_reset(c);
+
+               bch2_move_data(c,
+                              /* ratelimiting disabled for now */
+                              NULL, /*  &r->pd.rate, */
+                              writepoint_ptr(&c->rebalance_write_point),
+                              POS_MIN, POS_MAX,
+                              rebalance_pred, NULL,
+                              &r->move_stats);
+       }
+
+       return 0;
+}
+
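
The throttle above aims to hold rebalance's CPU share near the fullest device's fill percentage: if the last pass used C jiffies of CPU over T jiffies of wall time, sleeping C * 100 / p - T makes C / (T + sleep) come out to p percent. Checking that with small numbers:

#include <assert.h>

int main(void)
{
        unsigned long prev_run_cputime = 10;    /* C: cpu jiffies used */
        unsigned long prev_run_time = 50;       /* T: wall jiffies elapsed */
        unsigned percent = 10;                  /* p: fullest device % full */

        long throttle = (long) (prev_run_cputime * 100 / percent)
                - (long) prev_run_time;

        assert(throttle == 50);
        /* 10 cpu / (50 + 50) wall == 10%, matching the target share */
        return 0;
}
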
+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
+{
+       char *out = buf, *end = out + PAGE_SIZE;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct rebalance_work w = rebalance_work(c);
+       char h1[21], h2[21];
+
+       bch2_hprint(h1, w.dev_most_full_work << 9);
+       bch2_hprint(h2, w.dev_most_full_capacity << 9);
+       out += scnprintf(out, end - out,
+                        "fullest_dev (%i):\t%s/%s\n",
+                        w.dev_most_full_idx, h1, h2);
+
+       bch2_hprint(h1, w.total_work << 9);
+       bch2_hprint(h2, c->capacity << 9);
+       out += scnprintf(out, end - out,
+                        "total work:\t\t%s/%s\n",
+                        h1, h2);
+
+       out += scnprintf(out, end - out,
+                        "rate:\t\t\t%u\n",
+                        r->pd.rate.rate);
+
+       switch (r->state) {
+       case REBALANCE_WAITING:
+               out += scnprintf(out, end - out, "waiting\n");
+               break;
+       case REBALANCE_THROTTLED:
+               bch2_hprint(h1,
+                           (r->throttled_until_iotime -
+                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+               out += scnprintf(out, end - out,
+                                "throttled for %lu sec or %s io\n",
+                                (r->throttled_until_cputime - jiffies) / HZ,
+                                h1);
+               break;
+       case REBALANCE_RUNNING:
+               out += scnprintf(out, end - out, "running\n");
+               out += scnprintf(out, end - out, "pos %llu:%llu\n",
+                                r->move_stats.iter.pos.inode,
+                                r->move_stats.iter.pos.offset);
+               break;
+       }
+
+       return out - buf;
+}
+
+void bch2_rebalance_stop(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       c->rebalance.pd.rate.rate = UINT_MAX;
+       bch2_ratelimit_reset(&c->rebalance.pd.rate);
+
+       p = rcu_dereference_protected(c->rebalance.thread, 1);
+       c->rebalance.thread = NULL;
+
+       if (p) {
+               /* for synchronizing with rebalance_wakeup() */
+               synchronize_rcu();
+
+               kthread_stop(p);
+               put_task_struct(p);
+       }
+}
+
+int bch2_rebalance_start(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       if (c->opts.nochanges)
+               return 0;
+
+       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       get_task_struct(p);
+       rcu_assign_pointer(c->rebalance.thread, p);
+       wake_up_process(p);
+       return 0;
+}
+
+void bch2_fs_rebalance_init(struct bch_fs *c)
+{
+       bch2_pd_controller_init(&c->rebalance.pd);
+
+       atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
+}
similarity index 65%
rename from libbcachefs/tier.h
rename to libbcachefs/rebalance.h
index 0c66dfea7c0dd24974cf127d24059d647b0677ed..2e6aa67724710fa603c8ee81177e347f04f6f1de 100644 (file)
@@ -1,12 +1,14 @@
-#ifndef _BCACHEFS_TIER_H
-#define _BCACHEFS_TIER_H
+#ifndef _BCACHEFS_REBALANCE_H
+#define _BCACHEFS_REBALANCE_H
+
+#include "rebalance_types.h"
 
 static inline void rebalance_wakeup(struct bch_fs *c)
 {
        struct task_struct *p;
 
        rcu_read_lock();
-       p = rcu_dereference(c->rebalance_thread);
+       p = rcu_dereference(c->rebalance.thread);
        if (p)
                wake_up_process(p);
        rcu_read_unlock();
@@ -16,8 +18,10 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
                            struct bch_io_opts *);
 void bch2_rebalance_add_work(struct bch_fs *, u64);
 
+ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
+
 void bch2_rebalance_stop(struct bch_fs *);
 int bch2_rebalance_start(struct bch_fs *);
 void bch2_fs_rebalance_init(struct bch_fs *);
 
-#endif /* _BCACHEFS_TIER_H */
+#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h
new file mode 100644 (file)
index 0000000..aaf5b9c
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
+#define _BCACHEFS_REBALANCE_TYPES_H
+
+#include "move_types.h"
+
+enum rebalance_state {
+       REBALANCE_WAITING,
+       REBALANCE_THROTTLED,
+       REBALANCE_RUNNING,
+};
+
+struct bch_fs_rebalance {
+       struct task_struct __rcu *thread;
+       struct bch_pd_controller pd;
+
+       atomic64_t              work_unknown_dev;
+
+       enum rebalance_state    state;
+       unsigned long           throttled_until_iotime;
+       unsigned long           throttled_until_cputime;
+       struct bch_move_stats   move_stats;
+
+       unsigned                enabled:1;
+};
+
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
index f0ff8d41923c40fcc234e19324bd6e28293dd0b4..afa59a476a704d81679635613fad24f1d49519da 100644 (file)
@@ -146,6 +146,8 @@ struct six_lock_waiter {
 /* This is probably up there with the more evil things I've done */
 #define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
 
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+
 static inline int six_can_spin_on_owner(struct six_lock *lock)
 {
        struct task_struct *owner;
@@ -257,6 +259,15 @@ fail:
        return false;
 }
 
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+{
+       return false;
+}
+
+#endif
+
 noinline
 static void __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type)
 {
index a2b981a3c9c50db224a59326595bba7d453b0fdb..9772d59730781e07ba2d8ec9bb776b3d363191e1 100644 (file)
@@ -624,7 +624,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
        bio_set_dev(bio, ca->disk_sb.bdev);
        bio->bi_iter.bi_sector  = le64_to_cpu(sb->offset);
        bio->bi_iter.bi_size    =
-               roundup(vstruct_bytes(sb),
+               roundup((size_t) vstruct_bytes(sb),
                        bdev_logical_block_size(ca->disk_sb.bdev));
        bio->bi_end_io          = write_super_endio;
        bio->bi_private         = ca;
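
vstruct_bytes() returns a u64, and roundup() is a macro whose expansion divides by its second argument; with a 64-bit dividend that becomes a 64-bit division, which 32-bit kernels cannot do natively (there is no linked __udivdi3; div_u64() is required instead). Narrowing first keeps the arithmetic native width, which is presumably the 32-bit fix here. Illustratively, with types assumed from context:

	u64      bytes = vstruct_bytes(sb);	/* 64-bit on every arch */
	unsigned bs    = bdev_logical_block_size(ca->disk_sb.bdev);

	/* roundup(x, y) expands to ((x + y - 1) / y) * y. A u64 x drags in
	 * 64-bit division; the superblock is far below 4GB, so narrowing
	 * to size_t first is safe: */
	bio->bi_iter.bi_size = roundup((size_t) bytes, bs);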
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index f407c205b93b4c4bc95018877f4d8581de588296..995b1c907318c823b009b3bbf48e45059a8c10af 100644 (file)
@@ -73,11 +73,6 @@ static inline __u64 jset_magic(struct bch_fs *c)
        return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
 }
 
-static inline __u64 pset_magic(struct bch_fs *c)
-{
-       return __le64_to_cpu(bch2_sb_magic(c) ^ PSET_MAGIC);
-}
-
 static inline __u64 bset_magic(struct bch_fs *c)
 {
        return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
@@ -136,4 +131,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
        };
 }
 
+size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
+                            struct bch_sb_field *);
+
 #endif /* _BCACHEFS_SUPER_IO_H */
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 16b8cbfc973a389ced7393b5a7614bec13156b4a..55da242c994dc719e6b770e644943cdb4f4ec75c 100644 (file)
 #include "migrate.h"
 #include "movinggc.h"
 #include "quota.h"
+#include "rebalance.h"
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
-#include "tier.h"
 
 #include <linux/backing-dev.h>
 #include <linux/blkdev.h>
@@ -398,10 +398,10 @@ err:
 
 static void bch2_fs_free(struct bch_fs *c)
 {
-#define BCH_TIME_STAT(name)                            \
-       bch2_time_stats_exit(&c->name##_time);
-       BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+       unsigned i;
+
+       for (i = 0; i < BCH_TIME_STAT_NR; i++)
+               bch2_time_stats_exit(&c->times[i]);
 
        bch2_fs_quota_exit(c);
        bch2_fs_fsio_exit(c);
@@ -565,10 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        init_rwsem(&c->gc_lock);
 
-#define BCH_TIME_STAT(name)                            \
-       bch2_time_stats_init(&c->name##_time);
-       BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+       for (i = 0; i < BCH_TIME_STAT_NR; i++)
+               bch2_time_stats_init(&c->times[i]);
 
        bch2_fs_allocator_init(c);
        bch2_fs_rebalance_init(c);
@@ -592,14 +590,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        seqcount_init(&c->gc_pos_lock);
 
        c->copy_gc_enabled              = 1;
-       c->rebalance_enabled            = 1;
-       c->rebalance_percent            = 10;
+       c->rebalance.enabled            = 1;
        c->promote_whole_extents        = true;
 
-       c->journal.write_time   = &c->journal_write_time;
-       c->journal.delay_time   = &c->journal_delay_time;
-       c->journal.blocked_time = &c->journal_blocked_time;
-       c->journal.flush_seq_time = &c->journal_flush_seq_time;
+       c->journal.write_time   = &c->times[BCH_TIME_journal_write];
+       c->journal.delay_time   = &c->times[BCH_TIME_journal_delay];
+       c->journal.blocked_time = &c->times[BCH_TIME_journal_blocked];
+       c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
 
@@ -647,7 +644,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        BIOSET_NEED_BVECS) ||
            !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
            lg_lock_init(&c->usage_lock) ||
-           mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
+           mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
+                                       btree_bytes(c)) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
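
The per-name time-stat fields (c->journal_write_time and friends) become a single c->times[] array indexed by a BCH_TIME_* enum, so setup, teardown and sysfs can all iterate instead of expanding a macro per field. The enum is presumably generated from the same BCH_TIME_STATS() x-macro; a sketch of the assumed shape (the real list lives in bcachefs.h):

	#define BCH_TIME_STATS()	\
		x(journal_write)	\
		x(journal_delay)	\
		x(journal_blocked)	\
		x(journal_flush_seq)

	enum bch_time_stats {
	#define x(name)	BCH_TIME_##name,
		BCH_TIME_STATS()
	#undef x
		BCH_TIME_STAT_NR
	};

struct bch_fs would then carry something like struct time_stats times[BCH_TIME_STAT_NR], which is what the loops above index.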
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 65345d80e4f4a813afb94b55e0afcad06c8d31e0..5e341a712cdf65dcd91483450f2977771ceb5f69 100644 (file)
@@ -24,9 +24,9 @@
 #include "keylist.h"
 #include "move.h"
 #include "opts.h"
+#include "rebalance.h"
 #include "replicas.h"
 #include "super-io.h"
-#include "tier.h"
 
 #include <linux/blkdev.h>
 #include <linux/sort.h>
@@ -183,8 +183,8 @@ rw_attribute(copy_gc_enabled);
 sysfs_pd_controller_attribute(copy_gc);
 
 rw_attribute(rebalance_enabled);
-rw_attribute(rebalance_percent);
 sysfs_pd_controller_attribute(rebalance);
+read_attribute(rebalance_work);
 rw_attribute(promote_whole_extents);
 
 rw_attribute(pd_controllers_update_seconds);
@@ -198,11 +198,11 @@ read_attribute(data_replicas_have);
        BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-#define BCH_TIME_STAT(_name)                                           \
+#define x(_name)                                               \
        static struct attribute sysfs_time_stat_##_name =               \
                { .name = #_name, .mode = S_IRUGO };
        BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
 
 static struct attribute sysfs_state_rw = {
        .name = "state",
@@ -340,9 +340,11 @@ SHOW(bch2_fs)
        sysfs_print(pd_controllers_update_seconds,
                    c->pd_controllers_update_seconds);
 
-       sysfs_printf(rebalance_enabled,         "%i", c->rebalance_enabled);
-       sysfs_print(rebalance_percent,          c->rebalance_percent);
-       sysfs_pd_controller_show(rebalance,     &c->rebalance_pd); /* XXX */
+       sysfs_printf(rebalance_enabled,         "%i", c->rebalance.enabled);
+       sysfs_pd_controller_show(rebalance,     &c->rebalance.pd); /* XXX */
+
+       if (attr == &sysfs_rebalance_work)
+               return bch2_rebalance_work_show(c, buf);
 
        sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 
@@ -404,7 +406,7 @@ STORE(__bch2_fs)
        }
 
        if (attr == &sysfs_rebalance_enabled) {
-               ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
+               ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
                        ?: (ssize_t) size;
 
                rebalance_wakeup(c);
@@ -413,9 +415,7 @@ STORE(__bch2_fs)
 
        sysfs_strtoul(pd_controllers_update_seconds,
                      c->pd_controllers_update_seconds);
-
-       sysfs_strtoul(rebalance_percent,        c->rebalance_percent);
-       sysfs_pd_controller_store(rebalance,    &c->rebalance_pd);
+       sysfs_pd_controller_store(rebalance,    &c->rebalance.pd);
 
        sysfs_strtoul(promote_whole_extents,    c->promote_whole_extents);
 
@@ -474,7 +474,6 @@ struct attribute *bch2_fs_files[] = {
        &sysfs_journal_write_delay_ms,
        &sysfs_journal_reclaim_delay_ms,
 
-       &sysfs_rebalance_percent,
        &sysfs_promote_whole_extents,
 
        &sysfs_compression_stats,
@@ -513,8 +512,11 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_prune_cache,
 
        &sysfs_copy_gc_enabled,
+
        &sysfs_rebalance_enabled,
+       &sysfs_rebalance_work,
        sysfs_pd_controller_files(rebalance),
+
        &sysfs_internal_uuid,
 
 #define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -613,11 +615,12 @@ SHOW(bch2_fs_time_stats)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
 
-#define BCH_TIME_STAT(name)                                            \
+#define x(name)                                                \
        if (attr == &sysfs_time_stat_##name)                            \
-               return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
+               return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
+                                            buf, PAGE_SIZE);
        BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
 
        return 0;
 }
@@ -629,10 +632,10 @@ STORE(bch2_fs_time_stats)
 SYSFS_OPS(bch2_fs_time_stats);
 
 struct attribute *bch2_fs_time_stats_files[] = {
-#define BCH_TIME_STAT(name)                                            \
+#define x(name)                                                \
        &sysfs_time_stat_##name,
        BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+#undef x
        NULL
 };
 
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
deleted file mode 100644 (file)
index a15a0fa..0000000
+++ /dev/null
@@ -1,259 +0,0 @@
-
-#include "bcachefs.h"
-#include "alloc.h"
-#include "btree_iter.h"
-#include "buckets.h"
-#include "clock.h"
-#include "disk_groups.h"
-#include "extents.h"
-#include "io.h"
-#include "move.h"
-#include "super-io.h"
-#include "tier.h"
-
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/sched/cputime.h>
-#include <trace/events/bcachefs.h>
-
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
-                                     const struct bch_extent_ptr *ptr,
-                                     struct bch_extent_crc_unpacked crc,
-                                     struct bch_io_opts *io_opts)
-{
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
-       if (io_opts->background_target &&
-           !dev_in_target(ca, io_opts->background_target) &&
-           !ptr->cached)
-               return true;
-
-       if (io_opts->background_compression &&
-           crc.compression_type !=
-           bch2_compression_opt_to_type[io_opts->background_compression])
-               return true;
-
-       return false;
-}
-
-void bch2_rebalance_add_key(struct bch_fs *c,
-                           struct bkey_s_c k,
-                           struct bch_io_opts *io_opts)
-{
-       const struct bch_extent_ptr *ptr;
-       struct bch_extent_crc_unpacked crc;
-       struct bkey_s_c_extent e;
-
-       if (!bkey_extent_is_data(k.k))
-               return;
-
-       if (!io_opts->background_target &&
-           !io_opts->background_compression)
-               return;
-
-       e = bkey_s_c_to_extent(k);
-
-       extent_for_each_ptr_crc(e, ptr, crc)
-               if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
-                       if (!atomic64_add_return(crc.compressed_size,
-                                                &ca->rebalance_work))
-                               rebalance_wakeup(c);
-               }
-}
-
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
-{
-       if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
-               rebalance_wakeup(c);
-}
-
-static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
-                                   enum bkey_type type,
-                                   struct bkey_s_c_extent e,
-                                   struct bch_io_opts *io_opts,
-                                   struct data_opts *data_opts)
-{
-       const struct bch_extent_ptr *ptr;
-       struct bch_extent_crc_unpacked crc;
-
-       /* Make sure we have room to add a new pointer: */
-       if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
-           BKEY_EXTENT_VAL_U64s_MAX)
-               return DATA_SKIP;
-
-       extent_for_each_ptr_crc(e, ptr, crc)
-               if (rebalance_ptr_pred(c, ptr, crc, io_opts))
-                       goto found;
-
-       return DATA_SKIP;
-found:
-       data_opts->target               = io_opts->background_target;
-       data_opts->btree_insert_flags   = 0;
-       return DATA_ADD_REPLICAS;
-}
-
-struct rebalance_work {
-       unsigned        dev_most_full_percent;
-       u64             dev_most_full_work;
-       u64             dev_most_full_capacity;
-       u64             total_work;
-};
-
-static struct rebalance_work rebalance_work(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       struct rebalance_work ret = { 0 };
-       unsigned i;
-
-       for_each_online_member(ca, c, i) {
-               u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
-                                               ca->mi.first_bucket);
-               u64 work = atomic64_read(&ca->rebalance_work) +
-                       atomic64_read(&c->rebalance_work_unknown_dev);
-               unsigned percent_full = div_u64(work * 100, capacity);
-
-               if (percent_full > ret.dev_most_full_percent) {
-                       ret.dev_most_full_percent       = percent_full;
-                       ret.dev_most_full_work          = work;
-                       ret.dev_most_full_capacity      = capacity;
-               }
-
-               ret.total_work += atomic64_read(&ca->rebalance_work);
-       }
-
-       ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
-
-       return ret;
-}
-
-static void rebalance_work_reset(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       unsigned i;
-
-       for_each_online_member(ca, c, i)
-               atomic64_set(&ca->rebalance_work, 0);
-
-       atomic64_set(&c->rebalance_work_unknown_dev, 0);
-}
-
-static unsigned long curr_cputime(void)
-{
-       u64 utime, stime;
-
-       task_cputime_adjusted(current, &utime, &stime);
-       return nsecs_to_jiffies(utime + stime);
-}
-
-static int bch2_rebalance_thread(void *arg)
-{
-       struct bch_fs *c = arg;
-       struct io_clock *clock = &c->io_clock[WRITE];
-       struct rebalance_work w, p;
-       unsigned long start, prev_start;
-       unsigned long prev_run_time, prev_run_cputime;
-       unsigned long cputime, prev_cputime;
-
-       set_freezable();
-
-       p               = rebalance_work(c);
-       prev_start      = jiffies;
-       prev_cputime    = curr_cputime();
-
-       while (!kthread_wait_freezable(c->rebalance_enabled)) {
-               struct bch_move_stats move_stats = { 0 };
-
-               w                       = rebalance_work(c);
-               start                   = jiffies;
-               cputime                 = curr_cputime();
-
-               prev_run_time           = start - prev_start;
-               prev_run_cputime        = cputime - prev_cputime;
-
-               if (!w.total_work) {
-                       kthread_wait_freezable(rebalance_work(c).total_work);
-                       continue;
-               }
-
-               if (w.dev_most_full_percent < 20 &&
-                   prev_run_cputime * 5 > prev_run_time) {
-                       if (w.dev_most_full_capacity) {
-                               bch2_kthread_io_clock_wait(clock,
-                                       atomic_long_read(&clock->now) +
-                                       div_u64(w.dev_most_full_capacity, 5));
-                       } else {
-
-                               set_current_state(TASK_INTERRUPTIBLE);
-                               if (kthread_should_stop())
-                                       break;
-
-                               schedule_timeout(prev_run_cputime * 5 -
-                                                prev_run_time);
-                               continue;
-                       }
-               }
-
-               /* minimum 1 mb/sec: */
-               c->rebalance_pd.rate.rate =
-                       max_t(u64, 1 << 11,
-                             c->rebalance_pd.rate.rate *
-                             max(p.dev_most_full_percent, 1U) /
-                             max(w.dev_most_full_percent, 1U));
-
-               rebalance_work_reset(c);
-
-               bch2_move_data(c, &c->rebalance_pd.rate,
-                              writepoint_ptr(&c->rebalance_write_point),
-                              POS_MIN, POS_MAX,
-                              rebalance_pred, NULL,
-                              &move_stats);
-       }
-
-       return 0;
-}
-
-void bch2_rebalance_stop(struct bch_fs *c)
-{
-       struct task_struct *p;
-
-       c->rebalance_pd.rate.rate = UINT_MAX;
-       bch2_ratelimit_reset(&c->rebalance_pd.rate);
-
-       p = c->rebalance_thread;
-       c->rebalance_thread = NULL;
-
-       if (p) {
-               /* for sychronizing with rebalance_wakeup() */
-               synchronize_rcu();
-
-               kthread_stop(p);
-               put_task_struct(p);
-       }
-}
-
-int bch2_rebalance_start(struct bch_fs *c)
-{
-       struct task_struct *p;
-
-       if (c->opts.nochanges)
-               return 0;
-
-       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
-       if (IS_ERR(p))
-               return PTR_ERR(p);
-
-       get_task_struct(p);
-
-       rcu_assign_pointer(c->rebalance_thread, p);
-       wake_up_process(c->rebalance_thread);
-       return 0;
-}
-
-void bch2_fs_rebalance_init(struct bch_fs *c)
-{
-       bch2_pd_controller_init(&c->rebalance_pd);
-
-       atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
-}
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 1f2c23b988affb7d4809a191aa54d89001a42e4d..60e1f1ff44eb10fe524b32348fabf4d3922ffa79 100644 (file)
@@ -203,7 +203,7 @@ bool bch2_is_zero(const void *_p, size_t n)
        return true;
 }
 
-void bch2_quantiles_update(struct quantiles *q, u64 v)
+static void bch2_quantiles_update(struct quantiles *q, u64 v)
 {
        unsigned i = 0;
 
@@ -569,6 +569,23 @@ start:             bv->bv_len      = min_t(size_t, PAGE_SIZE - bv->bv_offset,
        }
 }
 
+int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+       int i;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp_mask);
+               if (!bv->bv_page) {
+                       while (--bv >= bio->bi_io_vec)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+
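bch2_bio_alloc_pages() backs an already-described bio with freshly allocated pages, and unwinds its own partial work on failure: the --bv walk frees every page allocated before the one that failed. A hypothetical caller, assuming the bvecs were first sized with bch2_bio_map(bio, NULL) (which fills in lengths but leaves bv_page NULL):

	bio->bi_iter.bi_size = nr_pages << PAGE_SHIFT;
	bch2_bio_map(bio, NULL);		/* set up bvec lengths only */

	if (bch2_bio_alloc_pages(bio, GFP_NOIO)) {
		bio_put(bio);			/* helper already freed its pages */
		return -ENOMEM;
	}
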
 size_t bch2_rand_range(size_t max)
 {
        size_t rand;
@@ -771,20 +788,28 @@ void sort_cmp_size(void *base, size_t num, size_t size,
        }
 }
 
-void mempool_free_vp(void *element, void *pool_data)
+static void mempool_free_vp(void *element, void *pool_data)
 {
        size_t size = (size_t) pool_data;
 
        vpfree(element, size);
 }
 
-void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
 {
        size_t size = (size_t) pool_data;
 
        return vpmalloc(size, gfp_mask);
 }
 
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+       return size < PAGE_SIZE
+               ? mempool_init_kmalloc_pool(pool, min_nr, size)
+               : mempool_init(pool, min_nr, mempool_alloc_vp,
+                              mempool_free_vp, (void *) size);
+}
+
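This replaces the inline mempool_init_vp_pool() from util.h (removed below): element sizes under a page now get an ordinary kmalloc-backed pool, and only larger sizes pay for the vpmalloc (whole-page or vmalloc) path, mirroring the size split in kvpmalloc(). The btree_bounce_pool conversion in super.c above is the intended caller:

	/* backing allocator is picked per pool by element size: */
	ret = mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
					  btree_bytes(c));
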
 #if 0
 void eytzinger1_test(void)
 {
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 7c7264f4f5951a4a72ffc238e0ca7140f345dc0c..184915593e866aa89f4acae896c26c0f4328de8e 100644 (file)
@@ -68,9 +68,9 @@ struct closure;
 #define __flatten
 #endif
 
-#ifdef __LITTLE_ENDIAN
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 #define CPU_BIG_ENDIAN         0
-#else
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 #define CPU_BIG_ENDIAN         1
 #endif
 
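The old test was #ifdef __LITTLE_ENDIAN, which is fine inside the kernel (where only one of the two macros is defined) but wrong in this userspace build: glibc's <endian.h> defines both __LITTLE_ENDIAN and __BIG_ENDIAN unconditionally, as values for __BYTE_ORDER, so the ifdef always took the little-endian branch, even on big-endian hosts. The compiler-predefined __BYTE_ORDER__ / __ORDER_*_ENDIAN__ macros need no header and work in both worlds, hence the big-endian part of this update. A standalone demonstration:

	#include <endian.h>
	#include <stdio.h>

	int main(void)
	{
		/* glibc defines both names as plain values, so #ifdef
		 * __LITTLE_ENDIAN is true even on big-endian machines: */
		printf("__LITTLE_ENDIAN=%d __BIG_ENDIAN=%d __BYTE_ORDER=%d\n",
		       __LITTLE_ENDIAN, __BIG_ENDIAN, __BYTE_ORDER);

	#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
		printf("big endian\n");		/* compiler builtin, always right */
	#else
		printf("little endian\n");
	#endif
		return 0;
	}
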
@@ -113,14 +113,7 @@ static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
                : vpmalloc(size, gfp_mask);
 }
 
-void mempool_free_vp(void *element, void *pool_data);
-void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data);
-
-static inline int mempool_init_vp_pool(mempool_t *pool, int min_nr, size_t size)
-{
-       return mempool_init(pool, min_nr, mempool_alloc_vp,
-                           mempool_free_vp, (void *) size);
-}
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
 
 #define HEAP(type)                                                     \
 struct {                                                               \
@@ -610,6 +603,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 }
 
 void bch2_bio_map(struct bio *bio, void *base);
+int bch2_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 79a98f757cc950a940c58b1ea373c524706eb8a8..c89c7200a1b483a33cfc40b8b5d77c9a7aab6382 100644 (file)
@@ -5,8 +5,8 @@
 #include "compress.h"
 #include "extents.h"
 #include "fs.h"
+#include "rebalance.h"
 #include "str_hash.h"
-#include "tier.h"
 #include "xattr.h"
 
 #include <linux/dcache.h>
diff --git a/linux/sched.c b/linux/sched.c
index 2d61c480d5532c2074ecbdd181bab7e0e083d802..de6eb1428ebcf038a30a44c3faa88d6cb8c49584 100644 (file)
@@ -40,14 +40,22 @@ void schedule(void)
                      v, NULL, NULL, 0);
 }
 
-static void process_timeout(unsigned long __data)
+struct process_timer {
+       struct timer_list timer;
+       struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
 {
-       wake_up_process((struct task_struct *)__data);
+       struct process_timer *timeout =
+               container_of(t, struct process_timer, timer);
+
+       wake_up_process(timeout->task);
 }
 
 long schedule_timeout(long timeout)
 {
-       struct timer_list timer;
+       struct process_timer timer;
        unsigned long expire;
 
        switch (timeout)
@@ -80,10 +88,11 @@ long schedule_timeout(long timeout)
 
        expire = timeout + jiffies;
 
-       setup_timer(&timer, process_timeout, (unsigned long)current);
-       mod_timer(&timer, expire);
+       timer.task = current;
+       timer_setup_on_stack(&timer.timer, process_timeout, 0);
+       mod_timer(&timer.timer, expire);
        schedule();
-       del_timer_sync(&timer);
+       del_timer_sync(&timer.timer);
 
        timeout = expire - jiffies;
 out:
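
This is the kernel's post-4.15 timer interface: callbacks receive the timer_list pointer itself and recover their context with container_of(), instead of casting an unsigned long .data cookie. The general idiom, as a sketch with hypothetical names:

	#include <linux/timer.h>

	struct my_ctx {
		struct timer_list timer;
		int payload;
	};

	static void my_timeout(struct timer_list *t)
	{
		struct my_ctx *ctx = container_of(t, struct my_ctx, timer);
		/* use ctx->payload; no (struct foo *) cast from ->data */
	}

	static void arm(struct my_ctx *ctx)
	{
		timer_setup(&ctx->timer, my_timeout, 0);
		mod_timer(&ctx->timer, jiffies + HZ);
	}

timer_setup_on_stack(), used in the hunk above, is the variant of timer_setup() for timers with automatic storage, for the benefit of debugobjects.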
diff --git a/linux/timer.c b/linux/timer.c
index b67a54ac224d5008ea3f436576874080b3291380..dd5aba18b7f025f47c62e8c42e441119791fc780 100644 (file)
@@ -273,7 +273,7 @@ static int timer_thread(void *arg)
                        BUG_ON(!timer_running());
 
                        pthread_mutex_unlock(&timer_lock);
-                       timer->function(timer->data);
+                       timer->function(timer);
                        pthread_mutex_lock(&timer_lock);
 
                        timer_seq++;
diff --git a/linux/workqueue.c b/linux/workqueue.c
index f5942772dda67adf02723b2dd6620ec868c3a0b3..4dfd6cd9e2099ad4db1d7c109f07fb0f91bb40af 100644 (file)
@@ -55,9 +55,10 @@ bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
        return ret;
 }
 
-void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(struct timer_list *timer)
 {
-       struct delayed_work *dwork = (struct delayed_work *) __data;
+       struct delayed_work *dwork =
+               container_of(timer, struct delayed_work, timer);
 
        pthread_mutex_lock(&wq_lock);
        __queue_work(dwork->wq, &dwork->work);
@@ -71,8 +72,7 @@ static void __queue_delayed_work(struct workqueue_struct *wq,
        struct timer_list *timer = &dwork->timer;
        struct work_struct *work = &dwork->work;
 
-       BUG_ON(timer->function != delayed_work_timer_fn ||
-              timer->data != (unsigned long)dwork);
+       BUG_ON(timer->function != delayed_work_timer_fn);
        BUG_ON(timer_pending(timer));
        BUG_ON(!list_empty(&work->entry));
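
Same conversion in the workqueue shim: struct timer_list no longer carries a .data field, so delayed_work_timer_fn() recovers its delayed_work with container_of() and only the function pointer is left to sanity-check. For that BUG_ON() to keep holding, the shim's INIT_DELAYED_WORK() presumably now reads something like:

	/* assumed shape of the shim's initializer under the new timer API: */
	#define INIT_DELAYED_WORK(dw, fn)					\
		do {								\
			INIT_WORK(&(dw)->work, (fn));				\
			timer_setup(&(dw)->timer, delayed_work_timer_fn, 0);	\
		} while (0)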