Update bcachefs sources to c9b4a210f9 fixup! bcachefs: Fixes for going RO

diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 9d04e894175b3e73c2d847bf1410f8f730f84236..72d8ef77907b0af8696415e23f74e49a319949d2 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_H
 #define _BCACHEFS_H
 
 #include <linux/bio.h>
 #include <linux/closure.h>
 #include <linux/kobject.h>
-#include <linux/lglock.h>
 #include <linux/list.h>
+#include <linux/math64.h>
 #include <linux/mutex.h>
 #include <linux/percpu-refcount.h>
-#include <linux/radix-tree.h>
-#include <linux/rbtree.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/rhashtable.h>
 #include <linux/rwsem.h>
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/zstd.h>
 
 #include "bcachefs_format.h"
-#include "bset.h"
 #include "fifo.h"
 #include "opts.h"
 #include "util.h"
 
 #include <linux/dynamic_fault.h>
 
-#define bch2_fs_init_fault(name)                                               \
+#define bch2_fs_init_fault(name)                                       \
        dynamic_fault("bcachefs:bch_fs_init:" name)
 #define bch2_meta_read_fault(name)                                     \
         dynamic_fault("bcachefs:meta:read:" name)
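
These macros name fault-injection points built on dynamic_fault(); a minimal sketch of how such a point is typically consumed, assuming this header is included (the function name, fault name and error code here are illustrative, not taken from this diff):

static int my_init_step(void)
{
        /* Evaluates nonzero when the "bcachefs:bch_fs_init:fs_alloc" fault
         * is armed, letting tests exercise the error path. */
        if (bch2_fs_init_fault("fs_alloc"))
                return -EFAULT;

        /* ... normal initialization would continue here ... */
        return 0;
}
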
        printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn(c, fmt, ...) \
        printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+       printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
        printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_err_ratelimited(c, fmt, ...) \
+       printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 
 #define bch_verbose(c, fmt, ...)                                       \
 do {                                                                   \
-       if ((c)->opts.verbose_recovery)                                 \
+       if ((c)->opts.verbose)                                          \
                bch_info(c, fmt, ##__VA_ARGS__);                        \
 } while (0)
 
+#define pr_verbose_init(opts, fmt, ...)                                        \
+do {                                                                   \
+       if (opt_get(opts, verbose))                                     \
+               pr_info(fmt, ##__VA_ARGS__);                            \
+} while (0)
+
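
bch_verbose() and pr_verbose_init() use the usual do { ... } while (0) wrapper so each expands to exactly one statement; a self-contained userspace sketch of the same idiom (hypothetical log_verbose macro, not part of this header):

#include <stdbool.h>
#include <stdio.h>

/* Miniature version of the do { } while (0) pattern used above. */
#define log_verbose(verbose, fmt, ...)                                 \
do {                                                                   \
        if (verbose)                                                   \
                printf(fmt, ##__VA_ARGS__);                            \
} while (0)

int main(void)
{
        bool verbose = true;

        /* Safe in an unbraced if/else only because of the wrapper: */
        if (verbose)
                log_verbose(verbose, "verbose mode enabled\n");
        else
                printf("quiet mode\n");

        return 0;
}
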
 /* Parameters that are useful for debugging, but should always be compiled in: */
 #define BCH_DEBUG_PARAMS_ALWAYS()                                      \
        BCH_DEBUG_PARAM(key_merging_disabled,                           \
@@ -248,16 +258,32 @@ do {                                                                      \
        BCH_DEBUG_PARAM(expensive_debug_checks,                         \
                "Enables various runtime debugging checks that "        \
                "significantly affect performance")                     \
+       BCH_DEBUG_PARAM(debug_check_iterators,                          \
+               "Enables extra verification for btree iterators")       \
        BCH_DEBUG_PARAM(debug_check_bkeys,                              \
                "Run bkey_debugcheck (primarily checking GC/allocation "\
                "information) when iterating over keys")                \
-       BCH_DEBUG_PARAM(version_stress_test,                            \
-               "Assigns random version numbers to newly written "      \
-               "extents, to test overlapping extent cases")            \
        BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
                "Reread btree nodes at various points to verify the "   \
                "mergesort in the read path against modifications "     \
                "done in memory")                                       \
+       BCH_DEBUG_PARAM(journal_seq_verify,                             \
+               "Store the journal sequence number in the version "     \
+               "number of every btree key, and verify that btree "     \
+               "update ordering is preserved during recovery")         \
+       BCH_DEBUG_PARAM(inject_invalid_keys,                            \
+               "Assign invalid version numbers to newly written "      \
+               "btree keys, to test handling of invalid keys")         \
+       BCH_DEBUG_PARAM(test_alloc_startup,                             \
+               "Force allocator startup to use the slowpath where it " \
+               "can't find enough free buckets without invalidating "  \
+               "cached data")                                          \
+       BCH_DEBUG_PARAM(force_reconstruct_read,                         \
+               "Force reads to use the reconstruct path, when reading " \
+               "from erasure coded extents")                           \
+       BCH_DEBUG_PARAM(test_restart_gc,                                \
+               "Test restarting mark and sweep gc when bucket gens change")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -267,54 +293,73 @@ do {                                                                      \
 #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
 #endif
 
-/* name, frequency_units, duration_units */
-#define BCH_TIME_STATS()                                               \
-       BCH_TIME_STAT(btree_node_mem_alloc,     sec, us)                \
-       BCH_TIME_STAT(btree_gc,                 sec, ms)                \
-       BCH_TIME_STAT(btree_split,              sec, us)                \
-       BCH_TIME_STAT(btree_sort,               ms, us)                 \
-       BCH_TIME_STAT(btree_read,               ms, us)                 \
-       BCH_TIME_STAT(journal_write,            us, us)                 \
-       BCH_TIME_STAT(journal_delay,            ms, us)                 \
-       BCH_TIME_STAT(journal_blocked,          sec, ms)                \
-       BCH_TIME_STAT(journal_flush_seq,        us, us)
+#define BCH_TIME_STATS()                       \
+       x(btree_node_mem_alloc)                 \
+       x(btree_node_split)                     \
+       x(btree_node_sort)                      \
+       x(btree_node_read)                      \
+       x(btree_gc)                             \
+       x(btree_lock_contended_read)            \
+       x(btree_lock_contended_intent)          \
+       x(btree_lock_contended_write)           \
+       x(data_write)                           \
+       x(data_read)                            \
+       x(data_promote)                         \
+       x(journal_write)                        \
+       x(journal_delay)                        \
+       x(journal_flush_seq)                    \
+       x(blocked_journal)                      \
+       x(blocked_allocate)                     \
+       x(blocked_allocate_open_bucket)
+
+enum bch_time_stats {
+#define x(name) BCH_TIME_##name,
+       BCH_TIME_STATS()
+#undef x
+       BCH_TIME_STAT_NR
+};
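
This is the x-macro pattern: the single BCH_TIME_STATS() list is re-expanded under different definitions of x() to produce the enum above, and the enum values later index the times[BCH_TIME_STAT_NR] array in struct bch_fs. A stripped-down sketch of the pattern with hypothetical names:

/* One list, expanded twice: once into an enum, once into a name table. */
#define MY_STATS()      \
        x(read)         \
        x(write)        \
        x(flush)

enum my_stat {
#define x(name) MY_STAT_##name,
        MY_STATS()
#undef x
        MY_STAT_NR
};

static const char * const my_stat_names[] = {
#define x(name) #name,
        MY_STATS()
#undef x
};
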
 
 #include "alloc_types.h"
+#include "btree_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
-#include "io_types.h"
+#include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
-#include "move_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "replicas_types.h"
 #include "super_types.h"
 
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX            512
-
-/*
- * Number of nodes we might have to allocate in a worst case btree split
- * operation - we split all the way up to the root, then allocate a new root.
- */
-#define btree_reserve_required_nodes(depth)    (((depth) + 1) * 2 + 1)
-
 /* Number of nodes btree coalesce will try to coalesce at once */
 #define GC_MERGE_NODES         4U
 
 /* Maximum number of nodes we might need to allocate atomically: */
-#define BTREE_RESERVE_MAX                                              \
-       (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
+#define BTREE_RESERVE_MAX      (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
 
 /* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE             (BTREE_RESERVE_MAX * 4)
+#define BTREE_NODE_RESERVE     BTREE_RESERVE_MAX
+
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
 
 struct btree;
-struct crypto_blkcipher;
-struct crypto_ahash;
 
 enum gc_phase {
-       GC_PHASE_SB_METADATA            = BTREE_ID_NR + 1,
+       GC_PHASE_NOT_RUNNING,
+       GC_PHASE_START,
+       GC_PHASE_SB,
+
+       GC_PHASE_BTREE_EC,
+       GC_PHASE_BTREE_EXTENTS,
+       GC_PHASE_BTREE_INODES,
+       GC_PHASE_BTREE_DIRENTS,
+       GC_PHASE_BTREE_XATTRS,
+       GC_PHASE_BTREE_ALLOC,
+       GC_PHASE_BTREE_QUOTAS,
+       GC_PHASE_BTREE_REFLINK,
+
        GC_PHASE_PENDING_DELETE,
-       GC_PHASE_DONE
+       GC_PHASE_ALLOC,
 };
 
 struct gc_pos {
@@ -323,35 +368,16 @@ struct gc_pos {
        unsigned                level;
 };
 
-struct bch_member_cpu {
-       u64                     nbuckets;       /* device size */
-       u16                     first_bucket;   /* index of first bucket used */
-       u16                     bucket_size;    /* sectors */
-       u8                      state;
-       u8                      tier;
-       u8                      replacement;
-       u8                      discard;
-       u8                      valid;
-};
-
-struct bch_replicas_cpu_entry {
-       u8                      data_type;
-       u8                      devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
-       struct rcu_head         rcu;
-       unsigned                nr;
-       unsigned                entry_size;
-       struct bch_replicas_cpu_entry entries[];
+struct io_count {
+       u64                     sectors[2][BCH_DATA_NR];
 };
 
 struct bch_dev {
        struct kobject          kobj;
        struct percpu_ref       ref;
+       struct completion       ref_completion;
        struct percpu_ref       io_ref;
-       struct completion       stop_complete;
-       struct completion       offline_complete;
+       struct completion       io_ref_completion;
 
        struct bch_fs           *fs;
 
@@ -364,14 +390,29 @@ struct bch_dev {
        uuid_le                 uuid;
        char                    name[BDEVNAME_SIZE];
 
-       struct bcache_superblock disk_sb;
+       struct bch_sb_handle    disk_sb;
+       struct bch_sb           *sb_read_scratch;
+       int                     sb_write_error;
 
-       struct dev_group        self;
+       struct bch_devs_mask    self;
 
-       /* biosets used in cloned bios for replicas and moving_gc */
+       /* biosets used in cloned bios for writing multiple replicas */
        struct bio_set          replica_set;
 
-       struct task_struct      *alloc_thread;
+       /*
+        * Buckets:
+        * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
+        * gc_lock, for device resize - holding any is sufficient for access;
+        * or rcu_read_lock(), but only for ptr_stale().
+        */
+       struct bucket_array __rcu *buckets[2];
+       unsigned long           *buckets_nouse;
+       struct rw_semaphore     bucket_lock;
+
+       struct bch_dev_usage __percpu *usage[2];
+
+       /* Allocator: */
+       struct task_struct __rcu *alloc_thread;
 
        /*
         * free: Buckets that are ready to be used
@@ -382,82 +423,84 @@ struct bch_dev {
         * gens/prios, they'll be moved to the free list (and possibly discarded
         * in the process)
         */
-       DECLARE_FIFO(long, free)[RESERVE_NR];
-       DECLARE_FIFO(long, free_inc);
-       spinlock_t              freelist_lock;
-       unsigned                nr_invalidated;
-       bool                    alloc_thread_started;
-       bool                    need_alloc_write;
-
-       size_t                  fifo_last_bucket;
+       alloc_fifo              free[RESERVE_NR];
+       alloc_fifo              free_inc;
 
-       /* Allocation stuff: */
+       u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
+       unsigned                open_buckets_partial_nr;
 
-       /* most out of date gen in the btree */
-       u8                      *oldest_gens;
-       struct bucket           *buckets;
-       unsigned short          bucket_bits;    /* ilog2(bucket_size) */
+       size_t                  fifo_last_bucket;
 
        /* last calculated minimum prio */
-       u16                     min_prio[2];
+       u16                     max_last_bucket_io[2];
 
-       /*
-        * Bucket book keeping. The first element is updated by GC, the
-        * second contains a saved copy of the stats from the beginning
-        * of GC.
-        */
-       struct bch_dev_usage __percpu *usage_percpu;
-       struct bch_dev_usage    usage_cached;
-
-       atomic_long_t           saturated_count;
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
-       u64                     allocator_journal_seq_flush;
-       bool                    allocator_invalidating_data;
-
-       alloc_heap              alloc_heap;
-       bucket_heap             copygc_heap;
 
-       /* Moving GC: */
-       struct task_struct      *moving_gc_read;
-
-       struct bch_pd_controller moving_gc_pd;
+       /*
+        * XXX: this should be an enum for allocator state, so as to include
+        * error state
+        */
+       enum {
+               ALLOCATOR_STOPPED,
+               ALLOCATOR_RUNNING,
+               ALLOCATOR_BLOCKED,
+               ALLOCATOR_BLOCKED_FULL,
+       }                       allocator_state;
 
-       /* Tiering: */
-       struct write_point      tiering_write_point;
+       alloc_heap              alloc_heap;
 
+       /* Copying GC: */
+       struct task_struct      *copygc_thread;
+       copygc_heap             copygc_heap;
+       struct bch_pd_controller copygc_pd;
        struct write_point      copygc_write_point;
+       u64                     copygc_threshold;
+
+       atomic64_t              rebalance_work;
 
        struct journal_device   journal;
 
        struct work_struct      io_error_work;
 
        /* The rest of this all shows up in sysfs */
-       atomic64_t              meta_sectors_written;
-       atomic64_t              btree_sectors_written;
-       u64 __percpu            *sectors_written;
+       atomic64_t              cur_latency[2];
+       struct time_stats       io_latency[2];
+
+#define CONGESTED_MAX          1024
+       atomic_t                congested;
+       u64                     congested_last;
+
+       struct io_count __percpu *io_done;
 };
 
-/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
- */
 enum {
+       /* startup: */
+       BCH_FS_ALLOC_READ_DONE,
+       BCH_FS_ALLOC_CLEAN,
+       BCH_FS_ALLOCATOR_STARTED,
+       BCH_FS_ALLOCATOR_RUNNING,
+       BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
+       BCH_FS_FSCK_DONE,
+       BCH_FS_STARTED,
+       BCH_FS_RW,
+
+       /* shutdown: */
+       BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
-       BCH_FS_GC_STOPPING,
-       BCH_FS_GC_FAILURE,
-       BCH_FS_BDEV_MOUNTED,
+
+       /* errors: */
        BCH_FS_ERROR,
-       BCH_FS_FSCK_FIXED_ERRORS,
-       BCH_FS_FSCK_DONE,
+       BCH_FS_ERRORS_FIXED,
+
+       /* misc: */
+       BCH_FS_BDEV_MOUNTED,
        BCH_FS_FIXED_GENS,
+       BCH_FS_ALLOC_WRITTEN,
        BCH_FS_REBUILD_REPLICAS,
+       BCH_FS_HOLD_BTREE_WRITES,
 };
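
These enumerators are bit numbers rather than a state machine; assuming the unsigned long flags word that struct bch_fs carries (not visible in this hunk), they are used with the test_bit()/set_bit() family, roughly as follows (helper names illustrative):

/* Sketch only: assumes struct bch_fs has an 'unsigned long flags' member
 * that these enum values index as bit numbers. */
static bool fs_started_sample(struct bch_fs *c)
{
        return test_bit(BCH_FS_STARTED, &c->flags);
}

static void fs_mark_started_sample(struct bch_fs *c)
{
        set_bit(BCH_FS_STARTED, &c->flags);
}
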
 
 struct btree_debug {
@@ -467,19 +510,29 @@ struct btree_debug {
        struct dentry           *failed;
 };
 
-struct bch_tier {
-       unsigned                idx;
-       struct task_struct      *migrate;
-       struct bch_pd_controller pd;
+struct bch_fs_pcpu {
+       u64                     sectors_available;
+};
 
-       struct dev_group        devs;
+struct journal_seq_blacklist_table {
+       size_t                  nr;
+       struct journal_seq_blacklist_table_entry {
+               u64             start;
+               u64             end;
+               bool            dirty;
+       }                       entries[0];
 };
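
entries[0] is an old-style flexible array member, so a table with nr entries is allocated as the header plus a trailing array; a minimal allocation sketch under that assumption (helper name illustrative):

/* Illustrative: size the blacklist table as header + nr trailing entries. */
static struct journal_seq_blacklist_table *
blacklist_table_alloc_sample(size_t nr)
{
        struct journal_seq_blacklist_table *t;

        t = kzalloc(sizeof(*t) + nr * sizeof(t->entries[0]), GFP_KERNEL);
        if (!t)
                return NULL;

        t->nr = nr;
        return t;
}
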
 
-enum bch_fs_state {
-       BCH_FS_STARTING         = 0,
-       BCH_FS_STOPPING,
-       BCH_FS_RO,
-       BCH_FS_RW,
+struct journal_keys {
+       struct journal_key {
+               enum btree_id   btree_id:8;
+               unsigned        level:8;
+               struct bkey_i   *k;
+               u32             journal_seq;
+               u32             journal_offset;
+       }                       *d;
+       size_t                  nr;
+       u64                     journal_seq_base;
 };
 
 struct bch_fs {
@@ -499,7 +552,6 @@ struct bch_fs {
 
        /* ro/rw, add/remove devices: */
        struct mutex            state_lock;
-       enum bch_fs_state       state;
 
        /* Counts outstanding writes, for clean transition to read-only */
        struct percpu_ref       writes;
@@ -507,10 +559,14 @@ struct bch_fs {
 
        struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
 
-       struct bch_replicas_cpu __rcu *replicas;
-       struct bch_replicas_cpu __rcu *replicas_gc;
+       struct bch_replicas_cpu replicas;
+       struct bch_replicas_cpu replicas_gc;
        struct mutex            replicas_gc_lock;
 
+       struct journal_entry_res replicas_journal_res;
+
+       struct bch_disk_groups_cpu __rcu *disk_groups;
+
        struct bch_opts         opts;
 
        /* Updated by bch2_sb_update():*/
@@ -518,72 +574,37 @@ struct bch_fs {
                uuid_le         uuid;
                uuid_le         user_uuid;
 
-               u16             block_size;
-               u16             btree_node_size;
+               u16             version;
+               u16             encoded_extent_max;
 
                u8              nr_devices;
                u8              clean;
 
-               u8              str_hash_type;
                u8              encryption_type;
 
                u64             time_base_lo;
                u32             time_base_hi;
                u32             time_precision;
+               u64             features;
+               u64             compat;
        }                       sb;
 
-       struct bch_sb           *disk_sb;
-       unsigned                disk_sb_order;
+       struct bch_sb_handle    disk_sb;
 
        unsigned short          block_bits;     /* ilog2(block_size) */
 
+       u16                     btree_foreground_merge_threshold;
+
        struct closure          sb_write;
        struct mutex            sb_lock;
 
-       struct backing_dev_info bdi;
-
        /* BTREE CACHE */
-       struct bio_set          btree_read_bio;
+       struct bio_set          btree_bio;
 
        struct btree_root       btree_roots[BTREE_ID_NR];
        struct mutex            btree_root_lock;
 
-       bool                    btree_cache_table_init_done;
-       struct rhashtable       btree_cache_table;
-
-       /*
-        * We never free a struct btree, except on shutdown - we just put it on
-        * the btree_cache_freed list and reuse it later. This simplifies the
-        * code, and it doesn't cost us much memory as the memory usage is
-        * dominated by buffers that hold the actual btree node data and those
-        * can be freed - and the number of struct btrees allocated is
-        * effectively bounded.
-        *
-        * btree_cache_freeable effectively is a small cache - we use it because
-        * high order page allocations can be rather expensive, and it's quite
-        * common to delete and allocate btree nodes in quick succession. It
-        * should never grow past ~2-3 nodes in practice.
-        */
-       struct mutex            btree_cache_lock;
-       struct list_head        btree_cache;
-       struct list_head        btree_cache_freeable;
-       struct list_head        btree_cache_freed;
-
-       /* Number of elements in btree_cache + btree_cache_freeable lists */
-       unsigned                btree_cache_used;
-       unsigned                btree_cache_reserve;
-       struct shrinker         btree_cache_shrink;
-
-       /*
-        * If we need to allocate memory for a new btree node and that
-        * allocation fails, we can cannibalize another node in the btree cache
-        * to satisfy the allocation - lock to guarantee only one thread does
-        * this at a time:
-        */
-       struct closure_waitlist mca_wait;
-       struct task_struct      *btree_cache_alloc_lock;
-
-       mempool_t               btree_reserve_pool;
+       struct btree_cache      btree_cache;
 
        /*
         * Cache of allocated btree nodes - if we allocate a btree node and
@@ -592,39 +613,34 @@ struct bch_fs {
         * when allocating btree reserves fail halfway through) - instead, we
         * can stick them here:
         */
-       struct btree_alloc {
-               struct open_bucket      *ob;
-               BKEY_PADDED(k);
-       }                       btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+       struct btree_alloc      btree_reserve_cache[BTREE_NODE_RESERVE * 2];
        unsigned                btree_reserve_cache_nr;
        struct mutex            btree_reserve_cache_lock;
 
        mempool_t               btree_interior_update_pool;
        struct list_head        btree_interior_update_list;
+       struct list_head        btree_interior_updates_unwritten;
        struct mutex            btree_interior_update_lock;
+       struct closure_waitlist btree_interior_update_wait;
+
+       struct workqueue_struct *btree_interior_update_worker;
+       struct work_struct      btree_interior_update_work;
+
+       /* btree_iter.c: */
+       struct mutex            btree_trans_lock;
+       struct list_head        btree_trans_list;
+       mempool_t               btree_iters_pool;
 
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
+       struct workqueue_struct *journal_reclaim_wq;
 
        /* ALLOCATION */
-       struct bch_pd_controller foreground_write_pd;
        struct delayed_work     pd_controllers_update;
        unsigned                pd_controllers_update_seconds;
-       spinlock_t              foreground_write_pd_lock;
-       struct bch_write_op     *write_wait_head;
-       struct bch_write_op     *write_wait_tail;
 
-       struct timer_list       foreground_write_wakeup;
-
-       /*
-        * These contain all r/w devices - i.e. devices we can currently
-        * allocate from:
-        */
-       struct dev_group        all_devs;
-       struct bch_tier         tiers[BCH_TIER_MAX];
-       /* NULL if we only have devices in one tier: */
-       struct bch_tier         *fastest_tier;
+       struct bch_devs_mask    rw_devs[BCH_DATA_NR];
 
        u64                     capacity; /* sectors */
 
@@ -634,16 +650,22 @@ struct bch_fs {
         * and forces them to be revalidated
         */
        u32                     capacity_gen;
+       unsigned                bucket_size_max;
 
        atomic64_t              sectors_available;
 
-       struct bch_fs_usage __percpu *usage_percpu;
-       struct bch_fs_usage     usage_cached;
-       struct lglock           usage_lock;
+       struct bch_fs_pcpu __percpu     *pcpu;
 
-       struct mutex            bucket_lock;
+       struct percpu_rw_semaphore      mark_lock;
 
-       struct closure_waitlist freelist_wait;
+       seqcount_t                      usage_lock;
+       struct bch_fs_usage             *usage_base;
+       struct bch_fs_usage __percpu    *usage[2];
+       struct bch_fs_usage __percpu    *usage_gc;
+
+       /* single element mempool: */
+       struct mutex            usage_scratch_lock;
+       struct bch_fs_usage     *usage_scratch;
 
        /*
         * When we invalidate buckets, we use both the priority and the amount
@@ -651,31 +673,32 @@ struct bch_fs {
         * those together consistently we keep track of the smallest nonzero
         * priority of any bucket.
         */
-       struct prio_clock       prio_clock[2];
+       struct bucket_clock     bucket_clock[2];
 
        struct io_clock         io_clock[2];
 
-       /* SECTOR ALLOCATOR */
-       struct list_head        open_buckets_open;
-       struct list_head        open_buckets_free;
-       unsigned                open_buckets_nr_free;
+       /* JOURNAL SEQ BLACKLIST */
+       struct journal_seq_blacklist_table *
+                               journal_seq_blacklist_table;
+       struct work_struct      journal_seq_blacklist_gc_work;
+
+       /* ALLOCATOR */
+       spinlock_t              freelist_lock;
+       struct closure_waitlist freelist_wait;
+       u64                     blocked_allocate;
+       u64                     blocked_allocate_open_bucket;
+       u8                      open_buckets_freelist;
+       u8                      open_buckets_nr_free;
        struct closure_waitlist open_buckets_wait;
-       spinlock_t              open_buckets_lock;
        struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
 
        struct write_point      btree_write_point;
+       struct write_point      rebalance_write_point;
 
-       struct write_point      write_points[WRITE_POINT_COUNT];
-       struct write_point      promote_write_point;
-
-       /*
-        * This write point is used for migrating data off a device
-        * and can point to any other device.
-        * We can't use the normal write points because those will
-        * gang up n replicas, and for migration we want only one new
-        * replica.
-        */
-       struct write_point      migration_write_point;
+       struct write_point      write_points[WRITE_POINT_MAX];
+       struct hlist_head       write_points_hash[WRITE_POINT_HASH_NR];
+       struct mutex            write_points_hash_lock;
+       unsigned                write_points_nr;
 
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
@@ -688,9 +711,6 @@ struct bch_fs {
         *
         * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
         *
-        * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
-        * currently running, and gc marks are currently valid
-        *
         * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
         * can read without a lock.
         */
@@ -709,32 +729,60 @@ struct bch_fs {
        struct bio_set          bio_write;
        struct mutex            bio_bounce_pages_lock;
        mempool_t               bio_bounce_pages;
+       struct rhashtable       promote_table;
 
-       mempool_t               lz4_workspace_pool;
-       void                    *zlib_workspace;
-       struct mutex            zlib_workspace_lock;
        mempool_t               compression_bounce[2];
+       mempool_t               compress_workspace[BCH_COMPRESSION_TYPE_NR];
+       mempool_t               decompress_workspace;
+       ZSTD_parameters         zstd_params;
 
        struct crypto_shash     *sha256;
-       struct crypto_skcipher  *chacha20;
+       struct crypto_sync_skcipher *chacha20;
        struct crypto_shash     *poly1305;
 
        atomic64_t              key_version;
 
-       struct bio_list         read_retry_list;
-       struct work_struct      read_retry_work;
-       spinlock_t              read_retry_lock;
+       mempool_t               large_bkey_pool;
+
+       /* REBALANCE */
+       struct bch_fs_rebalance rebalance;
+
+       /* STRIPES: */
+       GENRADIX(struct stripe) stripes[2];
+       struct mutex            ec_stripe_create_lock;
+
+       ec_stripes_heap         ec_stripes_heap;
+       spinlock_t              ec_stripes_heap_lock;
+
+       /* ERASURE CODING */
+       struct list_head        ec_new_stripe_list;
+       struct mutex            ec_new_stripe_lock;
+       u64                     ec_stripe_hint;
+
+       struct bio_set          ec_bioset;
+
+       struct work_struct      ec_stripe_delete_work;
+       struct llist_head       ec_stripe_delete_list;
+
+       /* REFLINK */
+       u64                     reflink_hint;
+
+       /* VFS IO PATH - fs-io.c */
+       struct bio_set          writepage_bioset;
+       struct bio_set          dio_write_bioset;
+       struct bio_set          dio_read_bioset;
+
+       struct bio_list         btree_write_error_list;
+       struct work_struct      btree_write_error_work;
+       spinlock_t              btree_write_error_lock;
 
        /* ERRORS */
        struct list_head        fsck_errors;
        struct mutex            fsck_error_lock;
        bool                    fsck_alloc_err;
 
-       /* FILESYSTEM */
-       wait_queue_head_t       writeback_wait;
-       atomic_t                writeback_pages;
-       unsigned                writeback_pages_max;
-       atomic_long_t           nr_inodes;
+       /* QUOTAS */
+       struct bch_memquota_type quotas[QTYP_NR];
 
        /* DEBUG JUNK */
        struct dentry           *debug;
@@ -756,42 +804,33 @@ struct bch_fs {
        mempool_t               btree_bounce_pool;
 
        struct journal          journal;
+       struct list_head        journal_entries;
+       struct journal_keys     journal_keys;
 
-       unsigned                bucket_journal_seq;
+       u64                     last_bucket_seq_cleanup;
 
        /* The rest of this all shows up in sysfs */
        atomic_long_t           read_realloc_races;
+       atomic_long_t           extent_migrate_done;
+       atomic_long_t           extent_migrate_raced;
 
        unsigned                btree_gc_periodic:1;
-       unsigned                foreground_write_ratelimit_enabled:1;
        unsigned                copy_gc_enabled:1;
-       unsigned                tiering_enabled:1;
-       unsigned                tiering_percent;
-
-       /*
-        * foreground writes will be throttled when the number of free
-        * buckets is below this percentage
-        */
-       unsigned                foreground_target_percent;
+       bool                    promote_whole_extents;
 
 #define BCH_DEBUG_PARAM(name, description) bool name;
        BCH_DEBUG_PARAMS_ALL()
 #undef BCH_DEBUG_PARAM
 
-#define BCH_TIME_STAT(name, frequency_units, duration_units)           \
-       struct time_stats       name##_time;
-       BCH_TIME_STATS()
-#undef BCH_TIME_STAT
+       struct time_stats       times[BCH_TIME_STAT_NR];
 };
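
The enum generated from BCH_TIME_STATS() near the top of the file indexes this times[] array; a hedged usage sketch, assuming the bch2_time_stats_update() helper from util.h (which this header includes):

/* Illustrative: feed a measured interval into one of the stat slots above. */
static void record_data_read_time_sample(struct bch_fs *c, u64 start_ns)
{
        bch2_time_stats_update(&c->times[BCH_TIME_data_read], start_ns);
}
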
 
-static inline bool bch2_fs_running(struct bch_fs *c)
-{
-       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
-static inline unsigned bucket_pages(const struct bch_dev *ca)
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 {
-       return ca->mi.bucket_size / PAGE_SECTORS;
+#ifndef NO_BCACHEFS_FS
+       if (c->vfs_sb)
+               c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
 }
 
 static inline unsigned bucket_bytes(const struct bch_dev *ca)
@@ -801,7 +840,35 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca)
 
 static inline unsigned block_bytes(const struct bch_fs *c)
 {
-       return c->sb.block_size << 9;
+       return c->opts.block_size << 9;
+}
+
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+       return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+       s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+       if (c->sb.time_precision == 1)
+               return ns;
+
+       return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+       struct timespec64 now;
+
+       ktime_get_coarse_real_ts64(&now);
+       return timespec_to_bch2_time(c, now);
+}
+
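
A small sketch of how the two conversion helpers above compose; the round trip from an on-disk value is exact as long as the scaled nanosecond count fits in a timespec64 (helper name illustrative):

/* Illustrative round trip: on-disk time -> timespec64 -> on-disk time. */
static void time_roundtrip_sample(struct bch_fs *c, u64 ondisk)
{
        struct timespec64 ts = bch2_time_to_timespec(c, ondisk);
        s64 back             = timespec_to_bch2_time(c, ts);

        /* 'back' equals 'ondisk' whenever the intermediate nanosecond
         * value is representable in a timespec64. */
        (void) back;
}
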
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+       return dev < c->sb.nr_devices && c->devs[dev];
 }
 
 #endif /* _BCACHEFS_H */