]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/bcachefs.h
Update bcachefs sources to 39a84c99af2d bcachefs: Clamp replicas_required to replicas
[bcachefs-tools-debian] / libbcachefs / bcachefs.h
index 295efeda12ff8eaaf43e6c88aa9aabea138fb548..3b48c5e133b5b51de31e9f8cab7413d21fd11286 100644 (file)
 #include <linux/mutex.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
 #include <linux/rhashtable.h>
 #include <linux/rwsem.h>
 #include <linux/semaphore.h>
 #include <linux/seqlock.h>
 #include <linux/shrinker.h>
 #include <linux/srcu.h>
+#include <linux/thread_with_file_types.h>
+#include <linux/time_stats.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <linux/zstd.h>
 
 #define race_fault(...)                        dynamic_fault("bcachefs:race")
 
 /* Increment (_c)->counters[BCH_COUNTER_<_name>] with this_cpu_inc(). */
+#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
+
 /*
  * Count the event named _name on _c, then call trace_<_name>() with the
  * remaining arguments.
  */
 #define trace_and_count(_c, _name, ...)                                        \
 do {                                                                   \
-       this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]);              \
+       count_event(_c, _name);                                         \
        trace_##_name(__VA_ARGS__);                                     \
 } while (0)
 
@@ -262,46 +267,76 @@ do {                                                                      \
 
 #define bch2_fmt(_c, fmt)              bch2_log_msg(_c, fmt "\n")
 
 /*
  * printf-style print routine taking the filesystem as its first argument;
  * __printf(2, 3) marks fmt as argument 2 and the variadic args as starting
  * at argument 3.
  */
+__printf(2, 3)
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
 /*
  * Accepts either a struct bch_dev * or a struct bch_fs *: a device selects
  * its ->fs member, a filesystem pointer is passed through unchanged. Any
  * other argument type fails to compile (there is no default association).
  *
  * NOTE(review): the cast in the bch_dev arm is presumably there so that arm
  * still compiles when _c is a struct bch_fs *, since every _Generic
  * association has to be a valid expression.
  */
+#define maybe_dev_to_fs(_c)    _Generic((_c),                          \
+       struct bch_dev *:       ((struct bch_dev *) (_c))->fs,          \
+       struct bch_fs *:        (_c))
+
 /* Print via __bch2_print(); _c may be a struct bch_fs * or a struct bch_dev *. */
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
 /*
  * Rate-limited bch2_print(): the ratelimit state is a static local of the
  * expansion, so each use site is limited independently, using the default
  * interval and burst. The message is only printed when __ratelimit() allows.
  */
+#define bch2_print_ratelimited(_c, ...)                                        \
+do {                                                                   \
+       static DEFINE_RATELIMIT_STATE(_rs,                              \
+                                     DEFAULT_RATELIMIT_INTERVAL,       \
+                                     DEFAULT_RATELIMIT_BURST);         \
+                                                                       \
+       if (__ratelimit(&_rs))                                          \
+               bch2_print(_c, __VA_ARGS__);                            \
+} while (0)
+
 /*
  * Filesystem-scoped logging at KERN_INFO / KERN_NOTICE / KERN_WARNING.
  * The message is wrapped by bch2_fmt(), which appends the trailing newline,
  * so fmt must be a string literal without one.
  */
 #define bch_info(c, fmt, ...) \
-       printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_notice(c, fmt, ...) \
-       printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn(c, fmt, ...) \
-       printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_warn_ratelimited(c, fmt, ...) \
-       printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
 
 /* Filesystem-scoped error message at KERN_ERR, formatted by bch2_fmt(). */
 #define bch_err(c, fmt, ...) \
-       printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err_dev(ca, fmt, ...) \
-       printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
 #define bch_err_dev_offset(ca, _offset, fmt, ...) \
-       printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
 /*
  * KERN_ERR messages formatted by bch2_fmt_inum() / bch2_fmt_inum_offset(),
  * which receive _inum (and _offset) in addition to the format string.
  */
 #define bch_err_inum(c, _inum, fmt, ...) \
-       printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
 #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
-       printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+       bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
 
 /*
  * Rate-limited counterparts of the bch_err*() macros; they go through
  * bch2_print_ratelimited(), so each use site has its own ratelimit state.
  * The *_dev_* variants pass the device (ca) as the print target.
  */
 #define bch_err_ratelimited(c, fmt, ...) \
-       printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
 #define bch_err_dev_ratelimited(ca, fmt, ...) \
-       printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
 #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
-       printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
 #define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
-       printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
 #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
-       printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+       bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+/*
+ * Decide whether an error code should be logged: true for any nonzero @err
+ * that does not match BCH_ERR_transaction_restart.
+ */
+static inline bool should_print_err(int err)
+{
+       if (!err)
+               return false;
+
+       return !bch2_err_matches(err, BCH_ERR_transaction_restart);
+}
 
 /*
  * Log "<calling function>(): error <error string>" for _ret, but only when
  * should_print_err(_ret) is true. _ret is expanded more than once, so it
  * should be free of side effects.
  */
 #define bch_err_fn(_c, _ret)                                           \
 do {                                                                   \
-       if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+       if (should_print_err(_ret))                                     \
                bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
 } while (0)
 
 /*
  * Rate-limited form of the "<function>(): error <error string>" message:
  * printed through bch_err_ratelimited() when should_print_err(_ret) is true.
  * _ret is expanded more than once.
  */
+#define bch_err_fn_ratelimited(_c, _ret)                               \
+do {                                                                   \
+       if (should_print_err(_ret))                                     \
+               bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
 /*
  * Like bch_err_fn(), with a caller-supplied message between "error" and the
  * error string. _msg is pasted into the format string, so it must be a string
  * literal; its printf arguments follow in __VA_ARGS__ and the error string is
  * appended as the final argument.
  */
 #define bch_err_msg(_c, _ret, _msg, ...)                               \
 do {                                                                   \
-       if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+       if (should_print_err(_ret))                                     \
                bch_err(_c, "%s(): error " _msg " %s", __func__,        \
                        ##__VA_ARGS__, bch2_err_str(_ret));             \
 } while (0)
@@ -392,6 +427,7 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(btree_node_merge)                     \
        x(btree_node_sort)                      \
        x(btree_node_read)                      \
+       x(btree_node_read_done)                 \
        x(btree_interior_update_foreground)     \
        x(btree_interior_update_total)          \
        x(btree_gc)                             \
@@ -469,6 +505,7 @@ enum gc_phase {
        GC_PHASE_BTREE_deleted_inodes,
        GC_PHASE_BTREE_logged_ops,
        GC_PHASE_BTREE_rebalance_work,
+       GC_PHASE_BTREE_subvolume_children,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -558,7 +595,7 @@ struct bch_dev {
 
        /* The rest of this all shows up in sysfs */
        atomic64_t              cur_latency[2];
-       struct bch2_time_stats  io_latency[2];
+       struct time_stats_quantiles     io_latency[2];
 
 #define CONGESTED_MAX          1024
        atomic_t                congested;
@@ -567,32 +604,35 @@ struct bch_dev {
        struct io_count __percpu *io_done;
 };
 
-enum {
-       /* startup: */
-       BCH_FS_STARTED,
-       BCH_FS_MAY_GO_RW,
-       BCH_FS_RW,
-       BCH_FS_WAS_RW,
-
-       /* shutdown: */
-       BCH_FS_STOPPING,
-       BCH_FS_EMERGENCY_RO,
-       BCH_FS_GOING_RO,
-       BCH_FS_WRITE_DISABLE_COMPLETE,
-       BCH_FS_CLEAN_SHUTDOWN,
-
-       /* fsck passes: */
-       BCH_FS_FSCK_DONE,
-       BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
-       BCH_FS_NEED_ANOTHER_GC,
-
-       BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
-
-       /* errors: */
-       BCH_FS_ERROR,
-       BCH_FS_TOPOLOGY_ERROR,
-       BCH_FS_ERRORS_FIXED,
-       BCH_FS_ERRORS_NOT_FIXED,
+/*
+ * initial_gc_unfixed
+ * error
+ * topology error
+ */
+
 /*
  * X-macro list of filesystem flag names. Define x(name) before expanding
  * BCH_FS_FLAGS() to generate per-flag code; the enum below uses it to produce
  * one BCH_FS_<name> enumerator per entry, in list order.
  */
+#define BCH_FS_FLAGS()                 \
+       x(started)                      \
+       x(may_go_rw)                    \
+       x(rw)                           \
+       x(was_rw)                       \
+       x(stopping)                     \
+       x(emergency_ro)                 \
+       x(going_ro)                     \
+       x(write_disable_complete)       \
+       x(clean_shutdown)               \
+       x(fsck_running)                 \
+       x(initial_gc_unfixed)           \
+       x(need_another_gc)              \
+       x(need_delete_dead_snapshots)   \
+       x(error)                        \
+       x(topology_error)               \
+       x(errors_fixed)                 \
+       x(errors_not_fixed)
+
 /* Flag numbers; used as bit indices with test_bit()/set_bit() on c->flags. */
+enum bch_fs_flags {
+#define x(n)           BCH_FS_##n,
+       BCH_FS_FLAGS()
+#undef x
 };
 
 struct btree_debug {
@@ -602,10 +642,11 @@ struct btree_debug {
 #define BCH_TRANSACTIONS_NR 128
 
 /*
  * Statistics record for btree transactions; struct bch_fs holds an array of
  * BCH_TRANSACTIONS_NR of these (btree_transaction_stats[]).
  * NOTE(review): presumably `lock` protects the max_* fields and
  * max_paths_text; confirm against the users of this struct.
  */
 struct btree_transaction_stats {
-       struct bch2_time_stats  lock_hold_times;
+       struct time_stats       duration;
+       struct time_stats       lock_hold_times;
        struct mutex            lock;
        unsigned                nr_max_paths;
-       unsigned                wb_updates_size;
+       unsigned                journal_entries_size;
        unsigned                max_mem;
        char                    *max_paths_text;
 };
@@ -693,6 +734,8 @@ struct bch_fs {
        struct super_block      *vfs_sb;
        dev_t                   dev;
        char                    name[40];
+       struct stdio_redirect   *stdio;
+       struct task_struct      *stdio_filter;
 
        /* ro/rw, add/remove/resize devices: */
        struct rw_semaphore     state_lock;
@@ -703,6 +746,13 @@ struct bch_fs {
 #else
        struct percpu_ref       writes;
 #endif
+       /*
+        * Analogous to c->writes, for asynchronous ops that don't necessarily
+        * need fs to be read-write
+        */
+       refcount_t              ro_ref;
+       wait_queue_head_t       ro_ref_wait;
+
        struct work_struct      read_only_work;
 
        struct bch_dev __rcu    *devs[BCH_SB_MEMBERS_MAX];
@@ -741,6 +791,7 @@ struct bch_fs {
                unsigned        nsec_per_time_unit;
                u64             features;
                u64             compat;
+               unsigned long   errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
        }                       sb;
 
 
@@ -870,8 +921,6 @@ struct bch_fs {
        /* ALLOCATOR */
        spinlock_t              freelist_lock;
        struct closure_waitlist freelist_wait;
-       u64                     blocked_allocate;
-       u64                     blocked_allocate_open_bucket;
 
        open_bucket_idx_t       open_buckets_freelist;
        open_bucket_idx_t       open_buckets_nr_free;
@@ -1005,10 +1054,21 @@ struct bch_fs {
        /* RECOVERY */
        u64                     journal_replay_seq_start;
        u64                     journal_replay_seq_end;
+       /*
+        * Two different uses:
+        * "Has this fsck pass?" - i.e. should this type of error be an
+        * emergency read-only
+        * And, in certain situations fsck will rewind to an earlier pass: used
+        * for signaling to the toplevel code which pass we want to run now.
+        */
        enum bch_recovery_pass  curr_recovery_pass;
        /* bitmap of explicitly enabled recovery passes: */
        u64                     recovery_passes_explicit;
+       /* bitmask of recovery passes that we actually ran */
        u64                     recovery_passes_complete;
+       /* never rewinds version of curr_recovery_pass */
+       enum bch_recovery_pass  recovery_pass_done;
+       struct semaphore        online_fsck_mutex;
 
        /* DEBUG JUNK */
        struct dentry           *fs_debug_dir;
@@ -1044,7 +1104,7 @@ struct bch_fs {
        unsigned                copy_gc_enabled:1;
        bool                    promote_whole_extents;
 
-       struct bch2_time_stats  times[BCH_TIME_STAT_NR];
+       struct time_stats       times[BCH_TIME_STAT_NR];
 
        struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
 
@@ -1071,7 +1131,7 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
 static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
 {
 #ifdef BCH_WRITE_REF_DEBUG
-       return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+       return !test_bit(BCH_FS_going_ro, &c->flags) &&
                atomic_long_inc_not_zero(&c->writes[ref]);
 #else
        return percpu_ref_tryget(&c->writes);
@@ -1081,7 +1141,7 @@ static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref
 static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
 {
 #ifdef BCH_WRITE_REF_DEBUG
-       return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+       return !test_bit(BCH_FS_going_ro, &c->flags) &&
                atomic_long_inc_not_zero(&c->writes[ref]);
 #else
        return percpu_ref_tryget_live(&c->writes);
@@ -1100,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
                if (atomic_long_read(&c->writes[i]))
                        return;
 
-       set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+       set_bit(BCH_FS_write_disable_complete, &c->flags);
        wake_up(&bch2_read_only_wait);
 #else
        percpu_ref_put(&c->writes);
 #endif
 }
 
+/*
+ * Try to take a reference on c->ro_ref.
+ *
+ * Returns false without touching the refcount when BCH_FS_stopping is set in
+ * c->flags, and false when the refcount is already zero; true otherwise.
+ */
+static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
+{
+       return !test_bit(BCH_FS_stopping, &c->flags) &&
+               refcount_inc_not_zero(&c->ro_ref);
+}
+
+/*
+ * Drop a reference on c->ro_ref. When the count reaches zero, wake up waiters
+ * on c->ro_ref_wait.
+ */
+static inline void bch2_ro_ref_put(struct bch_fs *c)
+{
+       if (!refcount_dec_and_test(&c->ro_ref))
+               return;
+
+       wake_up(&c->ro_ref_wait);
+}
+
 static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 {
 #ifndef NO_BCACHEFS_FS
@@ -1130,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
        return c->opts.block_size >> 9;
 }
 
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
-       return c->opts.btree_node_size >> 9;
-}
-
 /* True when the bit for @btree is set in the c->btree_key_cache_btrees mask. */
 static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
 {
        return c->btree_key_cache_btrees & (1U << btree);
 }
        return dev < c->sb.nr_devices && c->devs[dev];
 }
 
+/*
+ * Return the filesystem's stdio redirect for the calling task.
+ *
+ * Yields c->stdio, except that when c->stdio_filter is set and is not the
+ * current task the result is NULL.
+ */
+static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
+{
+       bool filtered_out = c->stdio_filter && c->stdio_filter != current;
+
+       return filtered_out ? NULL : c->stdio;
+}
+
 /*
  * Effective metadata_replicas_required: the configured value clamped so it
  * never exceeds opts.metadata_replicas.
  */
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+       return min(c->opts.metadata_replicas,
+                  c->opts.metadata_replicas_required);
+}
+
 /*
  * Effective data_replicas_required: the configured value clamped so it never
  * exceeds opts.data_replicas.
  */
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+       return min(c->opts.data_replicas,
+                  c->opts.data_replicas_required);
+}
+
 /*
  * Expands to an anonymous struct type holding a struct bkey_i member named
  * `key`, immediately followed by `pad` __u64 words in a member named
  * `<key>_pad`.
  */
 #define BKEY_PADDED_ONSTACK(key, pad)                          \
        struct { struct bkey_i key; __u64 key ## _pad[pad]; }