+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_H
#define _BCACHEFS_H
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
+#include <linux/math64.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#define bch_notice(c, fmt, ...) \
printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
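+/*
+ * The _ratelimited variants wrap printk_ratelimited(), so a repeating
+ * error (e.g. from a failing device) can't flood the kernel log:
+ */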
+#define bch_warn_ratelimited(c, fmt, ...) \
+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
BCH_DEBUG_PARAM(expensive_debug_checks, \
"Enables various runtime debugging checks that " \
"significantly affect performance") \
+ BCH_DEBUG_PARAM(debug_check_iterators, \
+ "Enables extra verification for btree iterators") \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
"cached data") \
BCH_DEBUG_PARAM(force_reconstruct_read, \
"Force reads to use the reconstruct path, when reading" \
- "from erasure coded extents")
+ "from erasure coded extents") \
+ BCH_DEBUG_PARAM(test_restart_gc, \
+ "Test restarting mark and sweep gc when bucket gens change")
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
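+/*
+ * Each debug param is typically also exposed as a module parameter; debug
+ * builds compile in the expensive checks via BCH_DEBUG_PARAMS_ALL().
+ */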
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
+ x(btree_node_split) \
+ x(btree_node_sort) \
+ x(btree_node_read) \
x(btree_gc) \
- x(btree_split) \
- x(btree_sort) \
- x(btree_read) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_promote) \
x(journal_write) \
x(journal_delay) \
- x(journal_blocked) \
- x(journal_flush_seq)
+ x(journal_flush_seq) \
+ x(blocked_journal) \
+ x(blocked_allocate) \
+ x(blocked_allocate_open_bucket)
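+/*
+ * BCH_TIME_STATS() is an x-macro list: each x(name) entry expands to a
+ * BCH_TIME_##name enum constant below (e.g. x(journal_write) ->
+ * BCH_TIME_journal_write), and elsewhere to a matching time_stats slot.
+ */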
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE BTREE_RESERVE_MAX
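+/* Open buckets reserved for btree node writes, one per potential replica: */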
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
+
struct btree;
enum gc_phase {
GC_PHASE_BTREE_XATTRS,
GC_PHASE_BTREE_ALLOC,
GC_PHASE_BTREE_QUOTAS,
+ GC_PHASE_BTREE_REFLINK,
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
char name[BDEVNAME_SIZE];
struct bch_sb_handle disk_sb;
+ struct bch_sb *sb_read_scratch;
int sb_write_error;
struct bch_devs_mask self;
*/
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_nouse;
- unsigned long *buckets_written;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
*/
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
- spinlock_t freelist_lock;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
- bool allocator_blocked;
+
+ /*
+ * XXX: this enum should also include an error state
+ */
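+ /*
+ * Roughly: STOPPED means the allocator thread isn't running, RUNNING is
+ * normal operation, BLOCKED means it's waiting for buckets it can
+ * invalidate, and BLOCKED_FULL means the freelists are full and it's
+ * waiting for buckets to be consumed:
+ */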
+ enum {
+ ALLOCATOR_STOPPED,
+ ALLOCATOR_RUNNING,
+ ALLOCATOR_BLOCKED,
+ ALLOCATOR_BLOCKED_FULL,
+ } allocator_state;
alloc_heap alloc_heap;
struct io_count __percpu *io_done;
};
-/*
- * Flag bits for what phase of startup/shutdown the cache set is at, how we're
- * shutting down, etc.:
- *
- * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
- * all the backing devices first (their cached data gets invalidated, and they
- * won't automatically reattach).
- */
enum {
/* startup: */
BCH_FS_ALLOC_READ_DONE,
+ BCH_FS_ALLOC_CLEAN,
BCH_FS_ALLOCATOR_STARTED,
+ BCH_FS_ALLOCATOR_RUNNING,
+ BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
+ BCH_FS_RW,
/* shutdown: */
+ BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
/* errors: */
BCH_FS_ERROR,
+ BCH_FS_ERRORS_FIXED,
/* misc: */
BCH_FS_BDEV_MOUNTED,
- BCH_FS_FSCK_FIXED_ERRORS,
- BCH_FS_FSCK_UNFIXED_ERRORS,
BCH_FS_FIXED_GENS,
+ BCH_FS_ALLOC_WRITTEN,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
struct dentry *failed;
};
-enum bch_fs_state {
- BCH_FS_STARTING = 0,
- BCH_FS_STOPPING,
- BCH_FS_RO,
- BCH_FS_RW,
-};
-
struct bch_fs_pcpu {
u64 sectors_available;
};
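+
+/*
+ * Ranges of journal sequence numbers that may never have been durably
+ * written; bsets claiming a blacklisted journal_seq are dropped when
+ * reading btree nodes:
+ */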
+struct journal_seq_blacklist_table {
+ size_t nr;
+ struct journal_seq_blacklist_table_entry {
+ u64 start;
+ u64 end;
+ bool dirty;
+ } entries[];
+};
+
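+/*
+ * Keys read from the journal during recovery, sorted for replay; until
+ * replay finishes these are overlaid on top of what's in the btree:
+ */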
+struct journal_keys {
+ struct journal_key {
+ enum btree_id btree_id:8;
+ unsigned level:8;
+ struct bkey_i *k;
+ u32 journal_seq;
+ u32 journal_offset;
+ } *d;
+ size_t nr;
+ u64 journal_seq_base;
+};
+
struct bch_fs {
struct closure cl;
/* ro/rw, add/remove devices: */
struct mutex state_lock;
- enum bch_fs_state state;
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
+ struct journal_entry_res replicas_journal_res;
+
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
u32 time_base_hi;
u32 time_precision;
u64 features;
+ u64 compat;
} sb;
struct bch_sb_handle disk_sb;
struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
- bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
- mempool_t btree_reserve_pool;
-
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don't use it, if we free it that space can't be reused until going
mempool_t btree_interior_update_pool;
struct list_head btree_interior_update_list;
+ struct list_head btree_interior_updates_unwritten;
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
+ struct workqueue_struct *btree_interior_update_worker;
+ struct work_struct btree_interior_update_work;
+
+ /* btree_iter.c: */
+ struct mutex btree_trans_lock;
+ struct list_head btree_trans_list; /* all live btree_trans, for debugfs */
mempool_t btree_iters_pool;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates */
struct workqueue_struct *copygc_wq;
+ struct workqueue_struct *journal_reclaim_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
struct percpu_rw_semaphore mark_lock;
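+ /*
+ * Roughly: usage_base holds totals that have been folded down, the two
+ * percpu usage[] sets hold deltas not yet folded in (read consistently
+ * under the usage_lock seqcount), and usage_gc is what mark and sweep
+ * gc accumulates into:
+ */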
+ seqcount_t usage_lock;
+ struct bch_fs_usage *usage_base;
struct bch_fs_usage __percpu *usage[2];
- struct bch_fs_usage __percpu *usage_scratch;
+ struct bch_fs_usage __percpu *usage_gc;
+
+ /* single element mempool: */
+ struct mutex usage_scratch_lock;
+ struct bch_fs_usage *usage_scratch;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any active bucket.
*/
struct io_clock io_clock[2];
+ /* JOURNAL SEQ BLACKLIST */
+ struct journal_seq_blacklist_table *
+ journal_seq_blacklist_table;
+ struct work_struct journal_seq_blacklist_gc_work;
+
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
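+ /* start times for the blocked_allocate* time stats above: */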
+ u64 blocked_allocate;
+ u64 blocked_allocate_open_bucket;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
- mempool_t compress_workspace[BCH_COMPRESSION_NR];
+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
mempool_t decompress_workspace;
ZSTD_parameters zstd_params;
atomic64_t key_version;
+ mempool_t large_bkey_pool;
+
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* ERASURE CODING */
struct list_head ec_new_stripe_list;
struct mutex ec_new_stripe_lock;
+ u64 ec_stripe_hint; /* where to start searching for a free stripe index */
struct bio_set ec_bioset;
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
+ /* REFLINK */
+ u64 reflink_hint; /* where to start searching for free space in the reflink btree */
+
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
mempool_t btree_bounce_pool;
struct journal journal;
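+ /* populated from the journal during recovery, freed once replay finishes: */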
+ struct list_head journal_entries;
+ struct journal_keys journal_keys;
u64 last_bucket_seq_cleanup;
#endif
};
-static inline bool bch2_fs_running(struct bch_fs *c)
-{
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
-}
-
static inline unsigned bucket_bytes(const struct bch_dev *ca)
{
return ca->mi.bucket_size << 9;
}

static inline unsigned block_bytes(const struct bch_fs *c)
{
return c->opts.block_size << 9;
}
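+
+/*
+ * On disk, timestamps are stored as (ns since the epoch - time_base) /
+ * time_precision. Round trip sketch (sub-precision remainder is lost):
+ *
+ *	s64 t = timespec_to_bch2_time(c, ts);
+ *	struct timespec64 ts2 = bch2_time_to_timespec(c, t);
+ */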
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
+{
+ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
+}
+
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+{
+ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
+
+ if (c->sb.time_precision == 1)
+ return ns;
+
+ return div_s64(ns, c->sb.time_precision);
+}
+
+static inline s64 bch2_current_time(struct bch_fs *c)
+{
+ struct timespec64 now;
+
+ ktime_get_coarse_real_ts64(&now);
+ return timespec_to_bch2_time(c, now);
+}
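+
+/*
+ * Typical use (sketch): stamping inode times, e.g.
+ *	inode_u.bi_mtime = bch2_current_time(c);
+ */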
+
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
+{
+ return dev < c->sb.nr_devices && c->devs[dev];
+}
+
#endif /* _BCACHEFS_H */