Update bcachefs sources to c9b4a210f9 fixup! bcachefs: Fixes for going RO
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 449eb0c1ce6116e682c38a0a64c8da622fd2e6d7..72d8ef77907b0af8696415e23f74e49a319949d2 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_H
 #define _BCACHEFS_H
 
 #include <linux/closure.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
+#include <linux/math64.h>
 #include <linux/mutex.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
        printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn(c, fmt, ...) \
        printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+#define bch_warn_ratelimited(c, fmt, ...) \
+       printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err(c, fmt, ...) \
        printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err_ratelimited(c, fmt, ...) \
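The bch_notice/bch_warn/bch_err family all take the filesystem as their first argument so bch2_fmt() can prefix each message with the filesystem name; bch_warn_ratelimited() added here is simply the rate-limited variant. A sketch of a call site (the function and the messages are hypothetical, not part of this patch):

	/* Hypothetical call sites for the logging helpers above. */
	static void example_report(struct bch_fs *c, int ret)
	{
		if (ret)
			bch_err(c, "btree write error: %i", ret);
		else
			bch_warn_ratelimited(c, "device congested, throttling writes");
	}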
@@ -254,6 +258,8 @@ do {                                                                        \
        BCH_DEBUG_PARAM(expensive_debug_checks,                         \
                "Enables various runtime debugging checks that "        \
                "significantly affect performance")                     \
+       BCH_DEBUG_PARAM(debug_check_iterators,                          \
+               "Enables extra verification for btree iterators")       \
        BCH_DEBUG_PARAM(debug_check_bkeys,                              \
                "Run bkey_debugcheck (primarily checking GC/allocation "\
                "information) when iterating over keys")                \
@@ -275,7 +281,9 @@ do {                                                                        \
                "cached data")                                          \
        BCH_DEBUG_PARAM(force_reconstruct_read,                         \
                "Force reads to use the reconstruct path, when reading" \
-               "from erasure coded extents")
+               "from erasure coded extents")                           \
+       BCH_DEBUG_PARAM(test_restart_gc,                                \
+               "Test restarting mark and sweep gc when bucket gens change")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
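The BCH_DEBUG_PARAM(name, description) entries form an X-macro list: the same list is expanded once to declare the variables and again to register them as module parameters, so the two new entries only have to be added here. A self-contained sketch of the pattern (all EXAMPLE_* names are made up for illustration):

	#include <stdbool.h>

	/* X-macro list: each entry expands to whatever EXAMPLE_PARAM()
	 * is defined as at the point of expansion. */
	#define EXAMPLE_DEBUG_PARAMS()						\
		EXAMPLE_PARAM(debug_check_iterators,				\
			      "Enables extra verification for btree iterators")	\
		EXAMPLE_PARAM(test_restart_gc,					\
			      "Test restarting mark and sweep gc when bucket gens change")

	/* Expansion 1: one bool per parameter. */
	#define EXAMPLE_PARAM(name, description) bool name;
	struct example_debug_opts {
		EXAMPLE_DEBUG_PARAMS()
	};
	#undef EXAMPLE_PARAM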
@@ -287,10 +295,10 @@ do {                                                                      \
 
 #define BCH_TIME_STATS()                       \
        x(btree_node_mem_alloc)                 \
+       x(btree_node_split)                     \
+       x(btree_node_sort)                      \
+       x(btree_node_read)                      \
        x(btree_gc)                             \
-       x(btree_split)                          \
-       x(btree_sort)                           \
-       x(btree_read)                           \
        x(btree_lock_contended_read)            \
        x(btree_lock_contended_intent)          \
        x(btree_lock_contended_write)           \
@@ -299,8 +307,10 @@ do {                                                                       \
        x(data_promote)                         \
        x(journal_write)                        \
        x(journal_delay)                        \
-       x(journal_blocked)                      \
-       x(journal_flush_seq)
+       x(journal_flush_seq)                    \
+       x(blocked_journal)                      \
+       x(blocked_allocate)                     \
+       x(blocked_allocate_open_bucket)
 
 enum bch_time_stats {
 #define x(name) BCH_TIME_##name,
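Each x(name) entry in BCH_TIME_STATS() becomes a BCH_TIME_name enum constant via the #define x(name) expansion just below, and a slot in the filesystem's time-stats array, so the renames and new counters above only have to be made in this one list. A sketch of recording one of the renamed timings (assuming the bch2_time_stats_update() helper and the c->times[] array declared elsewhere in the tree):

	/* Sketch: time a btree node split against the renamed counter. */
	static void example_timed_split(struct bch_fs *c)
	{
		u64 start_time = local_clock();

		/* ... perform the split ... */

		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
				       start_time);
	}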
@@ -330,6 +340,8 @@ enum bch_time_stats {
 /* Size of the freelist we allocate btree nodes from: */
 #define BTREE_NODE_RESERVE     BTREE_RESERVE_MAX
 
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
 struct btree;
 
 enum gc_phase {
@@ -344,6 +356,7 @@ enum gc_phase {
        GC_PHASE_BTREE_XATTRS,
        GC_PHASE_BTREE_ALLOC,
        GC_PHASE_BTREE_QUOTAS,
+       GC_PHASE_BTREE_REFLINK,
 
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
@@ -378,6 +391,7 @@ struct bch_dev {
        char                    name[BDEVNAME_SIZE];
 
        struct bch_sb_handle    disk_sb;
+       struct bch_sb           *sb_read_scratch;
        int                     sb_write_error;
 
        struct bch_devs_mask    self;
@@ -393,9 +407,6 @@ struct bch_dev {
         */
        struct bucket_array __rcu *buckets[2];
        unsigned long           *buckets_nouse;
-       unsigned long           *buckets_written;
-       /* most out of date gen in the btree */
-       u8                      *oldest_gens;
        struct rw_semaphore     bucket_lock;
 
        struct bch_dev_usage __percpu *usage[2];
@@ -414,7 +425,6 @@ struct bch_dev {
         */
        alloc_fifo              free[RESERVE_NR];
        alloc_fifo              free_inc;
-       spinlock_t              freelist_lock;
 
        u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
        unsigned                open_buckets_partial_nr;
@@ -426,7 +436,17 @@ struct bch_dev {
 
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
-       bool                    allocator_blocked;
+
+       /*
+        * XXX: this should be an enum for allocator state, so as to include
+        * error state
+        */
+       enum {
+               ALLOCATOR_STOPPED,
+               ALLOCATOR_RUNNING,
+               ALLOCATOR_BLOCKED,
+               ALLOCATOR_BLOCKED_FULL,
+       }                       allocator_state;
 
        alloc_heap              alloc_heap;
 
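The allocator thread's bool allocator_blocked is replaced by a small state machine that distinguishes "blocked waiting for buckets to invalidate" from "blocked because the device is full"; as the XXX comment notes, an error state could be folded in later. A hypothetical helper to show how the state might be queried (not part of the patch; the real transitions live in the allocator code):

	/* Hypothetical: true while the allocator thread is making progress. */
	static inline bool example_allocator_making_progress(struct bch_dev *ca)
	{
		return ca->allocator_state == ALLOCATOR_RUNNING;
	}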
@@ -454,34 +474,31 @@ struct bch_dev {
        struct io_count __percpu *io_done;
 };
 
-/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
- */
 enum {
        /* startup: */
        BCH_FS_ALLOC_READ_DONE,
+       BCH_FS_ALLOC_CLEAN,
        BCH_FS_ALLOCATOR_STARTED,
+       BCH_FS_ALLOCATOR_RUNNING,
+       BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
+       BCH_FS_RW,
 
        /* shutdown: */
+       BCH_FS_STOPPING,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
 
        /* errors: */
        BCH_FS_ERROR,
+       BCH_FS_ERRORS_FIXED,
 
        /* misc: */
        BCH_FS_BDEV_MOUNTED,
-       BCH_FS_FSCK_FIXED_ERRORS,
-       BCH_FS_FSCK_UNFIXED_ERRORS,
        BCH_FS_FIXED_GENS,
+       BCH_FS_ALLOC_WRITTEN,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
 };
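These constants are bit numbers for the filesystem's flags word, manipulated with the usual set_bit()/test_bit() helpers; note that BCH_FS_RW and BCH_FS_STOPPING move here from the enum bch_fs_state removed below. A sketch (assuming the unsigned long flags member of struct bch_fs):

	/* Sketch: read-write state is now a flag bit rather than an enum. */
	static inline bool example_fs_is_rw(struct bch_fs *c)
	{
		return test_bit(BCH_FS_RW, &c->flags);
	}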
@@ -493,17 +510,31 @@ struct btree_debug {
        struct dentry           *failed;
 };
 
-enum bch_fs_state {
-       BCH_FS_STARTING         = 0,
-       BCH_FS_STOPPING,
-       BCH_FS_RO,
-       BCH_FS_RW,
-};
-
 struct bch_fs_pcpu {
        u64                     sectors_available;
 };
 
+struct journal_seq_blacklist_table {
+       size_t                  nr;
+       struct journal_seq_blacklist_table_entry {
+               u64             start;
+               u64             end;
+               bool            dirty;
+       }                       entries[0];
+};
+
+struct journal_keys {
+       struct journal_key {
+               enum btree_id   btree_id:8;
+               unsigned        level:8;
+               struct bkey_i   *k;
+               u32             journal_seq;
+               u32             journal_offset;
+       }                       *d;
+       size_t                  nr;
+       u64                     journal_seq_base;
+};
+
 struct bch_fs {
        struct closure          cl;
 
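Of the two structures added above, journal_seq_blacklist_table is a flat, sorted copy of the on-disk blacklist so recovery can cheaply test whether a journal sequence number must be ignored, and journal_keys holds the keys read from the journal, to be replayed on top of the btree. A sketch of a lookup over the table (hypothetical helper; the sort order and half-open range semantics are assumptions, and the real lookup lives in journal_seq_blacklist.c):

	/* Hypothetical binary search, assuming entries are sorted by start
	 * and describe half-open [start, end) ranges. */
	static bool example_seq_blacklisted(struct journal_seq_blacklist_table *t,
					    u64 seq)
	{
		size_t l = 0, r = t ? t->nr : 0;

		while (l < r) {
			size_t m = l + (r - l) / 2;

			if (seq >= t->entries[m].end)
				l = m + 1;
			else if (seq < t->entries[m].start)
				r = m;
			else
				return true;
		}

		return false;
	}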
@@ -521,7 +552,6 @@ struct bch_fs {
 
        /* ro/rw, add/remove devices: */
        struct mutex            state_lock;
-       enum bch_fs_state       state;
 
        /* Counts outstanding writes, for clean transition to read-only */
        struct percpu_ref       writes;
@@ -533,6 +563,8 @@ struct bch_fs {
        struct bch_replicas_cpu replicas_gc;
        struct mutex            replicas_gc_lock;
 
+       struct journal_entry_res replicas_journal_res;
+
        struct bch_disk_groups_cpu __rcu *disk_groups;
 
        struct bch_opts         opts;
@@ -554,6 +586,7 @@ struct bch_fs {
                u32             time_base_hi;
                u32             time_precision;
                u64             features;
+               u64             compat;
        }                       sb;
 
        struct bch_sb_handle    disk_sb;
@@ -569,13 +602,10 @@ struct bch_fs {
        struct bio_set          btree_bio;
 
        struct btree_root       btree_roots[BTREE_ID_NR];
-       bool                    btree_roots_dirty;
        struct mutex            btree_root_lock;
 
        struct btree_cache      btree_cache;
 
-       mempool_t               btree_reserve_pool;
-
        /*
         * Cache of allocated btree nodes - if we allocate a btree node and
         * don't use it, if we free it that space can't be reused until going
@@ -589,14 +619,22 @@ struct bch_fs {
 
        mempool_t               btree_interior_update_pool;
        struct list_head        btree_interior_update_list;
+       struct list_head        btree_interior_updates_unwritten;
        struct mutex            btree_interior_update_lock;
        struct closure_waitlist btree_interior_update_wait;
 
+       struct workqueue_struct *btree_interior_update_worker;
+       struct work_struct      btree_interior_update_work;
+
+       /* btree_iter.c: */
+       struct mutex            btree_trans_lock;
+       struct list_head        btree_trans_list;
        mempool_t               btree_iters_pool;
 
        struct workqueue_struct *wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
+       struct workqueue_struct *journal_reclaim_wq;
 
        /* ALLOCATION */
        struct delayed_work     pd_controllers_update;
@@ -620,8 +658,14 @@ struct bch_fs {
 
        struct percpu_rw_semaphore      mark_lock;
 
+       seqcount_t                      usage_lock;
+       struct bch_fs_usage             *usage_base;
        struct bch_fs_usage __percpu    *usage[2];
-       struct bch_fs_usage __percpu    *usage_scratch;
+       struct bch_fs_usage __percpu    *usage_gc;
+
+       /* single element mempool: */
+       struct mutex            usage_scratch_lock;
+       struct bch_fs_usage     *usage_scratch;
 
        /*
         * When we invalidate buckets, we use both the priority and the amount
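In the usage accounting above, usage_base now holds the folded totals while the two __percpu usage arrays accumulate deltas; the new seqcount lets readers detect a concurrent fold into usage_base and retry. A sketch of the reader side for a single counter (nr_inodes is assumed from struct bch_fs_usage; the real read path also adds the not-yet-folded per-cpu deltas):

	/* Sketch: seqcount-protected read of one usage counter. */
	static u64 example_read_nr_inodes(struct bch_fs *c)
	{
		unsigned seq;
		u64 ret;

		do {
			seq = read_seqcount_begin(&c->usage_lock);
			ret = c->usage_base->nr_inodes;
		} while (read_seqcount_retry(&c->usage_lock, seq));

		return ret;
	}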
@@ -633,9 +677,16 @@ struct bch_fs {
 
        struct io_clock         io_clock[2];
 
+       /* JOURNAL SEQ BLACKLIST */
+       struct journal_seq_blacklist_table *
+                               journal_seq_blacklist_table;
+       struct work_struct      journal_seq_blacklist_gc_work;
+
        /* ALLOCATOR */
        spinlock_t              freelist_lock;
        struct closure_waitlist freelist_wait;
+       u64                     blocked_allocate;
+       u64                     blocked_allocate_open_bucket;
        u8                      open_buckets_freelist;
        u8                      open_buckets_nr_free;
        struct closure_waitlist open_buckets_wait;
@@ -681,7 +732,7 @@ struct bch_fs {
        struct rhashtable       promote_table;
 
        mempool_t               compression_bounce[2];
-       mempool_t               compress_workspace[BCH_COMPRESSION_NR];
+       mempool_t               compress_workspace[BCH_COMPRESSION_TYPE_NR];
        mempool_t               decompress_workspace;
        ZSTD_parameters         zstd_params;
 
@@ -691,6 +742,8 @@ struct bch_fs {
 
        atomic64_t              key_version;
 
+       mempool_t               large_bkey_pool;
+
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
@@ -704,12 +757,16 @@ struct bch_fs {
        /* ERASURE CODING */
        struct list_head        ec_new_stripe_list;
        struct mutex            ec_new_stripe_lock;
+       u64                     ec_stripe_hint;
 
        struct bio_set          ec_bioset;
 
        struct work_struct      ec_stripe_delete_work;
        struct llist_head       ec_stripe_delete_list;
 
+       /* REFLINK */
+       u64                     reflink_hint;
+
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
        struct bio_set          dio_write_bioset;
@@ -747,6 +804,8 @@ struct bch_fs {
        mempool_t               btree_bounce_pool;
 
        struct journal          journal;
+       struct list_head        journal_entries;
+       struct journal_keys     journal_keys;
 
        u64                     last_bucket_seq_cleanup;
 
@@ -774,11 +833,6 @@ static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 #endif
 }
 
-static inline bool bch2_fs_running(struct bch_fs *c)
-{
-       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
 static inline unsigned bucket_bytes(const struct bch_dev *ca)
 {
        return ca->mi.bucket_size << 9;
@@ -789,4 +843,32 @@ static inline unsigned block_bytes(const struct bch_fs *c)
        return c->opts.block_size << 9;
 }
 
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+       return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+       s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+       if (c->sb.time_precision == 1)
+               return ns;
+
+       return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+       struct timespec64 now;
+
+       ktime_get_coarse_real_ts64(&now);
+       return timespec_to_bch2_time(c, now);
+}
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+       return dev < c->sb.nr_devices && c->devs[dev];
+}
+
 #endif /* _BCACHEFS_H */
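The new inline helpers at the end convert between the on-disk time representation (units of sb.time_precision nanoseconds, offset by sb.time_base_lo) and struct timespec64, which is why linux/math64.h is now included for div_s64(). A round-trip illustration (hypothetical call site; bch_info() is the existing info-level logging macro from this header):

	/* Hypothetical: take the current time and convert it back for printing. */
	static void example_timestamp(struct bch_fs *c)
	{
		s64 now = bch2_current_time(c);
		struct timespec64 ts = bch2_time_to_timespec(c, now);

		bch_info(c, "now = %lld.%09ld",
			 (long long) ts.tv_sec, ts.tv_nsec);
	}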