BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
- BCH_DEBUG_PARAM(version_stress_test, \
- "Assigns random version numbers to newly written " \
- "extents, to test overlapping extent cases") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
#include "alloc_types.h"
#include "buckets_types.h"
#include "clock_types.h"
-#include "io_types.h"
#include "journal_types.h"
#include "keylist_types.h"
-#include "move_types.h"
+#include "quota_types.h"
#include "super_types.h"
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX 512
-
/*
* Number of nodes we might have to allocate in a worst case btree split
* operation - we split all the way up to the root, then allocate a new root.
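As a rough illustration of that worst case (the define this comment documents is elided from this excerpt): each level's split replaces one node with two freshly allocated ones, and a root split additionally allocates a new root, so a tree of depth d needs on the order of 2 * d + 1 new nodes. A minimal sketch with hypothetical constants:

/*
 * Illustrative sketch only - EXAMPLE_BTREE_MAX_DEPTH and the 2 * depth + 1
 * bound are assumptions for the example, not the constants this header
 * actually defines.
 */
#define EXAMPLE_BTREE_MAX_DEPTH		4
#define EXAMPLE_SPLIT_RESERVE_MAX	(2 * EXAMPLE_BTREE_MAX_DEPTH + 1)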
struct crypto_ahash;
enum gc_phase {
- GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
+ GC_PHASE_SB = BTREE_ID_NR + 1,
GC_PHASE_PENDING_DELETE,
+ GC_PHASE_ALLOC,
GC_PHASE_DONE
};
unsigned level;
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u8 state;
- u8 tier;
- u8 replacement;
- u8 discard;
- u8 valid;
-};
-
-struct bch_replicas_cpu_entry {
- u8 data_type;
- u8 devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
- struct rcu_head rcu;
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_cpu_entry entries[];
+struct io_count {
+ u64 sectors[2][BCH_DATA_NR];
};
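struct io_count is referenced from struct bch_dev below as a __percpu pointer (io_done), so reading a total means summing across CPUs. A hedged sketch of that, assuming the first index distinguishes reads from writes (0 = read, 1 = write) - this excerpt does not say what the two dimensions mean:

/*
 * Sketch only: sums one (rw, data-type) counter over all CPUs. The meaning
 * of the indices is an assumption; 'type' is bounded by BCH_DATA_NR.
 */
static u64 example_dev_io_sectors(struct bch_dev *ca, int rw, unsigned type)
{
	u64 ret = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		ret += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][type];

	return ret;
}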
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
+ struct completion ref_completion;
struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
struct bch_fs *fs;
uuid_le uuid;
char name[BDEVNAME_SIZE];
- struct bcache_superblock disk_sb;
+ struct bch_sb_handle disk_sb;
+ int sb_write_error;
- struct dev_group self;
+ struct bch_devs_mask self;
- /* biosets used in cloned bios for replicas and moving_gc */
+ /* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
- struct task_struct *alloc_thread;
+ /*
+ * Buckets:
+ * Per-bucket arrays are protected against device resize by
+ * c->usage_lock, bucket_lock and gc_lock - holding any one of them is
+ * sufficient for access. rcu_read_lock() also suffices, but only for
+ * ptr_stale() (see the access sketch after this struct).
+ */
+ struct bucket_array __rcu *buckets;
+ unsigned long *buckets_dirty;
+ /* most out of date gen in the btree */
+ u8 *oldest_gens;
+ struct rw_semaphore bucket_lock;
- bool need_alloc_write;
+ struct bch_dev_usage __percpu *usage_percpu;
+ struct bch_dev_usage usage_cached;
+
+ /* Allocator: */
+ struct task_struct *alloc_thread;
/*
* free: Buckets that are ready to be used
* gens/prios, they'll be moved to the free list (and possibly discarded
* in the process)
*/
- DECLARE_FIFO(long, free)[RESERVE_NR];
- DECLARE_FIFO(long, free_inc);
+ alloc_fifo free[RESERVE_NR];
+ alloc_fifo free_inc;
spinlock_t freelist_lock;
- bool alloc_thread_started;
-
- size_t fifo_last_bucket;
+ unsigned nr_invalidated;
- /* Allocation stuff: */
+ u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
+ unsigned open_buckets_partial_nr;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
- struct bucket *buckets;
- unsigned short bucket_bits; /* ilog2(bucket_size) */
+ size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 min_prio[2];
- /*
- * Bucket book keeping. The first element is updated by GC, the
- * second contains a saved copy of the stats from the beginning
- * of GC.
- */
- struct bch_dev_usage __percpu *usage_percpu;
- struct bch_dev_usage usage_cached;
-
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
bool allocator_invalidating_data;
alloc_heap alloc_heap;
- bucket_heap copygc_heap;
-
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct bch_pd_controller moving_gc_pd;
-
- /* Tiering: */
- struct write_point tiering_write_point;
+ /* Copying GC: */
+ struct task_struct *copygc_thread;
+ copygc_heap copygc_heap;
+ struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
- atomic64_t meta_sectors_written;
- atomic64_t btree_sectors_written;
- u64 __percpu *sectors_written;
+ atomic_t latency[2];
+
+ struct io_count __percpu *io_done;
};
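Following up on the locking comment inside struct bch_dev above: a hedged sketch of the rcu_read_lock()-only access path it mentions for ptr_stale()-style generation checks. The bucket_array layout (a flexible array member b[] of struct bucket, each carrying a mark.gen generation) would come from buckets_types.h, which is not part of this excerpt, so those field names are assumptions.

/*
 * Sketch only: reads a bucket generation without bucket_lock/gc_lock, which
 * per the comment above is valid only for staleness checks. Field names
 * (b[], mark.gen) are assumed from buckets_types.h, not shown here.
 */
static inline u8 example_bucket_gen(struct bch_dev *ca, size_t b)
{
	struct bucket_array *buckets;
	u8 gen;

	rcu_read_lock();
	buckets = rcu_dereference(ca->buckets);
	gen = buckets->b[b].mark.gen;
	rcu_read_unlock();

	return gen;
}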
/*
* won't automatically reattach).
*/
enum {
+ /* startup: */
+ BCH_FS_BRAND_NEW_FS,
+ BCH_FS_ALLOC_READ_DONE,
+ BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
+ BCH_FS_FSCK_DONE,
+
+ /* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_GC_STOPPING,
+
+ /* errors: */
+ BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
+
+ /* misc: */
BCH_FS_BDEV_MOUNTED,
- BCH_FS_ERROR,
BCH_FS_FSCK_FIXED_ERRORS,
- BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
BCH_FS_REBUILD_REPLICAS,
+ BCH_FS_HOLD_BTREE_WRITES,
};
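These enum values look like bit numbers intended for the atomic bitops; the flags word they index is not visible in this excerpt, so the sketch below assumes struct bch_fs carries an unsigned long named flags:

/*
 * Sketch only: the 'flags' field and its type are assumptions, not shown in
 * this excerpt.
 */
static inline bool example_fs_emergency_ro(struct bch_fs *c)
{
	return test_bit(BCH_FS_EMERGENCY_RO, &c->flags);
}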
struct btree_debug {
struct task_struct *migrate;
struct bch_pd_controller pd;
- struct dev_group devs;
+ struct bch_devs_mask devs;
+ struct write_point wp;
};
enum bch_fs_state {
uuid_le uuid;
uuid_le user_uuid;
- u16 block_size;
- u16 btree_node_size;
+ u16 encoded_extent_max;
u8 nr_devices;
u8 clean;
- u8 str_hash_type;
u8 encryption_type;
u64 time_base_lo;
struct closure sb_write;
struct mutex sb_lock;
- struct backing_dev_info bdi;
-
/* BTREE CACHE */
- struct bio_set btree_read_bio;
+ struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
struct mutex btree_root_lock;
- bool btree_cache_table_init_done;
- struct rhashtable btree_cache_table;
-
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex btree_cache_lock;
- struct list_head btree_cache;
- struct list_head btree_cache_freeable;
- struct list_head btree_cache_freed;
-
- /* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned btree_cache_used;
- unsigned btree_cache_reserve;
- struct shrinker btree_cache_shrink;
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct closure_waitlist mca_wait;
- struct task_struct *btree_cache_alloc_lock;
+ struct btree_cache btree_cache;
mempool_t btree_reserve_pool;
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
- struct btree_alloc {
- struct open_bucket *ob;
- BKEY_PADDED(k);
- } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
- struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
- spinlock_t foreground_write_pd_lock;
- struct bch_write_op *write_wait_head;
- struct bch_write_op *write_wait_tail;
- struct timer_list foreground_write_wakeup;
/*
* These contain all r/w devices - i.e. devices we can currently
* allocate from:
*/
- struct dev_group all_devs;
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
struct bch_tier tiers[BCH_TIER_MAX];
/* NULL if we only have devices in one tier: */
+ struct bch_devs_mask *fastest_devs;
struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
struct bch_fs_usage usage_cached;
struct lglock usage_lock;
- struct mutex bucket_lock;
-
struct closure_waitlist freelist_wait;
/*
struct io_clock io_clock[2];
- /* SECTOR ALLOCATOR */
- struct list_head open_buckets_open;
- struct list_head open_buckets_free;
- unsigned open_buckets_nr_free;
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
+ u8 open_buckets_freelist;
+ u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
- spinlock_t open_buckets_lock;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point write_points[WRITE_POINT_COUNT];
- struct write_point promote_write_point;
-
- /*
- * This write point is used for migrating data off a device
- * and can point to any other device.
- * We can't use the normal write points because those will
- * gang up n replicas, and for migration we want only one new
- * replica.
- */
- struct write_point migration_write_point;
+ struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+ struct mutex write_points_hash_lock;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic64_t key_version;
- struct bio_list read_retry_list;
- struct work_struct read_retry_work;
- spinlock_t read_retry_lock;
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+
+ struct bio_list btree_write_error_list;
+ struct work_struct btree_write_error_work;
+ spinlock_t btree_write_error_lock;
/* ERRORS */
struct list_head fsck_errors;
unsigned writeback_pages_max;
atomic_long_t nr_inodes;
+ /* QUOTAS */
+ struct bch_memquota_type quotas[QTYP_NR];
+
/* DEBUG JUNK */
struct dentry *debug;
struct btree_debug btree_debug[BTREE_ID_NR];
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
+ atomic_long_t extent_migrate_done;
+ atomic_long_t extent_migrate_raced;
unsigned btree_gc_periodic:1;
- unsigned foreground_write_ratelimit_enabled:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
unsigned tiering_percent;
- /*
- * foreground writes will be throttled when the number of free
- * buckets is below this percentage
- */
- unsigned foreground_target_percent;
-
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#undef BCH_TIME_STAT
};
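The BCH_DEBUG_PARAM lines near the top of this diff and the #define BCH_DEBUG_PARAM(name, description) bool name; expansion just above are the two halves of an X-macro: one list of parameters, expanded with different definitions of the macro to generate different code. A small self-contained sketch of the idiom with made-up parameter names (the real list lives in BCH_DEBUG_PARAMS_ALL()):

/* Hypothetical parameter list, standing in for BCH_DEBUG_PARAMS_ALL(): */
#define EXAMPLE_DEBUG_PARAMS()						\
	EXAMPLE_DEBUG_PARAM(check_foo, "check foo on every access")	\
	EXAMPLE_DEBUG_PARAM(verify_bar, "verify bar after each write")

/* Expansion 1: one bool member per parameter, as struct bch_fs does above: */
struct example_opts {
#define EXAMPLE_DEBUG_PARAM(name, description)	bool name;
	EXAMPLE_DEBUG_PARAMS()
#undef EXAMPLE_DEBUG_PARAM
};

/* Expansion 2: the same list expanded as a table of names: */
static const char * const example_param_names[] = {
#define EXAMPLE_DEBUG_PARAM(name, description)	#name,
	EXAMPLE_DEBUG_PARAMS()
#undef EXAMPLE_DEBUG_PARAM
	NULL
};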
-static inline bool bch2_fs_running(struct bch_fs *c)
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
}
-static inline unsigned bucket_pages(const struct bch_dev *ca)
+static inline bool bch2_fs_running(struct bch_fs *c)
{
- return ca->mi.bucket_size / PAGE_SECTORS;
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
}
static inline unsigned bucket_bytes(const struct bch_dev *ca)
static inline unsigned block_bytes(const struct bch_fs *c)
{
- return c->sb.block_size << 9;
+ return c->opts.block_size << 9;
}
#endif /* _BCACHEFS_H */