"btree node it traverses") \
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
"Disables rewriting of btree nodes during mark and sweep")\
- BCH_DEBUG_PARAM(btree_gc_coalesce_disabled, \
- "Disables coalescing of btree nodes") \
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
"Disables the shrinker callback for the btree node cache")
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
- BCH_DEBUG_PARAM(version_stress_test, \
- "Assigns random version numbers to newly written " \
- "extents, to test overlapping extent cases") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
BCH_TIME_STAT(btree_gc, sec, ms) \
- BCH_TIME_STAT(btree_coalesce, sec, ms) \
BCH_TIME_STAT(btree_split, sec, us) \
BCH_TIME_STAT(btree_sort, ms, us) \
BCH_TIME_STAT(btree_read, ms, us) \
#include "alloc_types.h"
#include "buckets_types.h"
#include "clock_types.h"
-#include "io_types.h"
#include "journal_types.h"
#include "keylist_types.h"
-#include "move_types.h"
#include "super_types.h"
/* 256k, in sectors */
(btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES)
/* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2)
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
struct btree;
struct crypto_blkcipher;
struct crypto_ahash;
enum gc_phase {
- GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
+ GC_PHASE_SB = BTREE_ID_NR + 1, /* explicit start at BTREE_ID_NR + 1; the entries below count up from here */
GC_PHASE_PENDING_DELETE,
+ GC_PHASE_ALLOC, /* ordered after GC_PHASE_PENDING_DELETE and before GC_PHASE_DONE */
GC_PHASE_DONE
};
unsigned level;
};
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u8 state;
- u8 tier;
- u8 has_metadata;
- u8 has_data;
- u8 replacement;
- u8 discard;
- u8 valid;
+struct io_count { /* I/O tally; struct bch_dev keeps a percpu instance of this as io_done */
+ u64 sectors[2][BCH_DATA_NR]; /* indexed [0..1][0..BCH_DATA_NR-1]; first index presumably I/O direction (read/write) - TODO confirm */
};
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
+ struct completion ref_completion;
struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
struct bch_fs *fs;
uuid_le uuid;
char name[BDEVNAME_SIZE];
- struct bcache_superblock disk_sb;
+ struct bch_sb_handle disk_sb;
+ int sb_write_error;
- struct dev_group self;
+ struct bch_devs_mask self;
- /* biosets used in cloned bios for replicas and moving_gc */
+ /* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
struct task_struct *alloc_thread;
- struct prio_set *disk_buckets;
-
- /*
- * When allocating new buckets, prio_write() gets first dibs - since we
- * may not be allocate at all without writing priorities and gens.
- * prio_last_buckets[] contains the last buckets we wrote priorities to
- * (so gc can mark them as metadata).
- */
- u64 *prio_buckets;
- u64 *prio_last_buckets;
- spinlock_t prio_buckets_lock;
- struct bio *bio_prio;
- bool prio_read_done;
- bool need_prio_write;
- struct mutex prio_write_lock;
-
/*
* free: Buckets that are ready to be used
*
DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
spinlock_t freelist_lock;
+ unsigned nr_invalidated;
+ bool alloc_thread_started;
+
+ u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
+ unsigned open_buckets_partial_nr;
size_t fifo_last_bucket;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct bucket *buckets;
- unsigned short bucket_bits; /* ilog2(bucket_size) */
+ unsigned long *bucket_dirty;
/* last calculated minimum prio */
u16 min_prio[2];
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
+ size_t inc_gen_really_needs_gc;
+ u64 allocator_journal_seq_flush;
+ bool allocator_invalidating_data;
- bucket_heap alloc_heap;
- bucket_heap copygc_heap;
-
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct bch_pd_controller moving_gc_pd;
-
- /* Tiering: */
- struct write_point tiering_write_point;
+ alloc_heap alloc_heap;
+ /* Copying GC: */
+ struct task_struct *copygc_thread;
+ copygc_heap copygc_heap;
+ struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
- atomic64_t meta_sectors_written;
- atomic64_t btree_sectors_written;
- u64 __percpu *sectors_written;
+ atomic_t latency[2];
+
+ struct io_count __percpu *io_done;
};
/*
* won't automatically reattach).
*/
enum {
+ BCH_FS_ALLOC_READ_DONE,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_FSCK_FIXED_ERRORS,
BCH_FS_FSCK_DONE,
BCH_FS_FIXED_GENS,
+ BCH_FS_REBUILD_REPLICAS,
};
struct btree_debug {
struct task_struct *migrate;
struct bch_pd_controller pd;
- struct dev_group devs;
+ struct bch_devs_mask devs;
+ struct write_point wp;
};
enum bch_fs_state {
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
+ struct bch_replicas_cpu __rcu *replicas;
+ struct bch_replicas_cpu __rcu *replicas_gc;
+ struct mutex replicas_gc_lock;
+
struct bch_opts opts;
/* Updated by bch2_sb_update():*/
uuid_le uuid;
uuid_le user_uuid;
- u16 block_size;
- u16 btree_node_size;
+ u16 encoded_extent_max;
u8 nr_devices;
u8 clean;
- u8 meta_replicas_have;
- u8 data_replicas_have;
-
- u8 str_hash_type;
u8 encryption_type;
u64 time_base_lo;
struct closure sb_write;
struct mutex sb_lock;
- struct backing_dev_info bdi;
-
/* BTREE CACHE */
struct bio_set btree_read_bio;
struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
struct mutex btree_root_lock;
- bool btree_cache_table_init_done;
- struct rhashtable btree_cache_table;
-
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex btree_cache_lock;
- struct list_head btree_cache;
- struct list_head btree_cache_freeable;
- struct list_head btree_cache_freed;
-
- /* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned btree_cache_used;
- unsigned btree_cache_reserve;
- struct shrinker btree_cache_shrink;
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct closure_waitlist mca_wait;
- struct task_struct *btree_cache_alloc_lock;
+ struct btree_cache btree_cache;
mempool_t btree_reserve_pool;
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
- struct btree_alloc {
- struct open_bucket *ob;
- BKEY_PADDED(k);
- } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
- struct bch_pd_controller foreground_write_pd;
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
- spinlock_t foreground_write_pd_lock;
- struct bch_write_op *write_wait_head;
- struct bch_write_op *write_wait_tail;
- struct timer_list foreground_write_wakeup;
/*
* These contain all r/w devices - i.e. devices we can currently
* allocate from:
*/
- struct dev_group all_devs;
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
struct bch_tier tiers[BCH_TIER_MAX];
/* NULL if we only have devices in one tier: */
+ struct bch_devs_mask *fastest_devs;
struct bch_tier *fastest_tier;
u64 capacity; /* sectors */
struct io_clock io_clock[2];
- /* SECTOR ALLOCATOR */
- struct list_head open_buckets_open;
- struct list_head open_buckets_free;
- unsigned open_buckets_nr_free;
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
+ u8 open_buckets_freelist;
+ u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
- spinlock_t open_buckets_lock;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point write_points[WRITE_POINT_COUNT];
- struct write_point promote_write_point;
-
- /*
- * This write point is used for migrating data off a device
- * and can point to any other device.
- * We can't use the normal write points because those will
- * gang up n replicas, and for migration we want only one new
- * replica.
- */
- struct write_point migration_write_point;
+ struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+ struct mutex write_points_hash_lock;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
+ unsigned long gc_count;
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
mempool_t compression_bounce[2];
struct crypto_shash *sha256;
- struct crypto_blkcipher *chacha20;
+ struct crypto_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
- struct bio_list read_retry_list;
- struct work_struct read_retry_work;
- spinlock_t read_retry_lock;
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+
+ struct bio_list btree_write_error_list;
+ struct work_struct btree_write_error_work;
+ spinlock_t btree_write_error_lock;
/* ERRORS */
struct list_head fsck_errors;
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
+ atomic_long_t extent_migrate_done;
+ atomic_long_t extent_migrate_raced;
- unsigned foreground_write_ratelimit_enabled:1;
+ unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
unsigned tiering_enabled:1;
unsigned tiering_percent;
- /*
- * foreground writes will be throttled when the number of free
- * buckets is below this percentage
- */
- unsigned foreground_target_percent;
-
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#undef BCH_TIME_STAT
};
-static inline bool bch2_fs_running(struct bch_fs *c)
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) /* stores ra_pages into c->vfs_sb->s_bdi->ra_pages; empty body when NO_BCACHEFS_FS is defined */
{
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb) /* nothing to update when c->vfs_sb is NULL */
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
}
-static inline unsigned bucket_pages(const struct bch_dev *ca)
+static inline bool bch2_fs_running(struct bch_fs *c) /* true iff c->state is BCH_FS_RO or BCH_FS_RW */
{
- return ca->mi.bucket_size / PAGE_SECTORS;
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
}
static inline unsigned bucket_bytes(const struct bch_dev *ca)
static inline unsigned block_bytes(const struct bch_fs *c)
{
- return c->sb.block_size << 9;
+ return c->opts.block_size << 9; /* << 9 multiplies by 512; block_size presumably in 512-byte sectors - TODO confirm */
}
#endif /* _BCACHEFS_H */