diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index ab99af7..78c427f 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -251,9 +251,6 @@ do { \
 BCH_DEBUG_PARAM(debug_check_bkeys, \
 "Run bkey_debugcheck (primarily checking GC/allocation "\
 "information) when iterating over keys") \
- BCH_DEBUG_PARAM(version_stress_test, \
- "Assigns random version numbers to newly written " \
- "extents, to test overlapping extent cases") \
 BCH_DEBUG_PARAM(verify_btree_ondisk, \
 "Reread btree nodes at various points to verify the " \
 "mergesort in the read path against modifications " \
@@ -282,15 +279,11 @@ do { \
 #include "alloc_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
-#include "io_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
-#include "move_types.h"
+#include "quota_types.h"
 #include "super_types.h"
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX 512
-
 /*
 * Number of nodes we might have to allocate in a worst case btree split
 * operation - we split all the way up to the root, then allocate a new root.
@@ -312,8 +305,9 @@ struct crypto_blkcipher;
 struct crypto_ahash;
 enum gc_phase {
- GC_PHASE_SB_METADATA = BTREE_ID_NR + 1,
+ GC_PHASE_SB = BTREE_ID_NR + 1,
 GC_PHASE_PENDING_DELETE,
+ GC_PHASE_ALLOC,
 GC_PHASE_DONE
 };
@@ -323,35 +317,16 @@ struct gc_pos {
 unsigned level;
 };
-struct bch_member_cpu {
- u64 nbuckets; /* device size */
- u16 first_bucket; /* index of first bucket used */
- u16 bucket_size; /* sectors */
- u8 state;
- u8 tier;
- u8 replacement;
- u8 discard;
- u8 valid;
-};
-
-struct bch_replicas_cpu_entry {
- u8 data_type;
- u8 devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
- struct rcu_head rcu;
- unsigned nr;
- unsigned entry_size;
- struct bch_replicas_cpu_entry entries[];
+struct io_count {
+ u64 sectors[2][BCH_DATA_NR];
 };
 struct bch_dev {
 struct kobject kobj;
 struct percpu_ref ref;
+ struct completion ref_completion;
 struct percpu_ref io_ref;
- struct completion stop_complete;
- struct completion offline_complete;
+ struct completion io_ref_completion;
 struct bch_fs *fs;
@@ -364,16 +339,31 @@ struct bch_dev {
 uuid_le uuid;
 char name[BDEVNAME_SIZE];
- struct bcache_superblock disk_sb;
+ struct bch_sb_handle disk_sb;
+ int sb_write_error;
- struct dev_group self;
+ struct bch_devs_mask self;
- /* biosets used in cloned bios for replicas and moving_gc */
+ /* biosets used in cloned bios for writing multiple replicas */
 struct bio_set replica_set;
- struct task_struct *alloc_thread;
+ /*
+ * Buckets:
+ * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
+ * gc_lock, for device resize - holding any is sufficient for access:
+ * Or rcu_read_lock(), but only for ptr_stale():
+ */
+ struct bucket_array __rcu *buckets;
+ unsigned long *buckets_dirty;
+ /* most out of date gen in the btree */
+ u8 *oldest_gens;
+ struct rw_semaphore bucket_lock;
- bool need_alloc_write;
+ struct bch_dev_usage __percpu *usage_percpu;
+ struct bch_dev_usage usage_cached;
+
+ /* Allocator: */
+ struct task_struct *alloc_thread;
 /*
 * free: Buckets that are ready to be used
@@ -384,31 +374,19 @@ struct bch_dev {
 * gens/prios, they'll be moved to the free list (and possibly discarded
 * in the process)
 */
- DECLARE_FIFO(long, free)[RESERVE_NR];
- DECLARE_FIFO(long, free_inc);
+ alloc_fifo free[RESERVE_NR];
+ alloc_fifo free_inc;
 spinlock_t freelist_lock;
- bool alloc_thread_started;
-
- size_t fifo_last_bucket;
+ unsigned nr_invalidated;
- /* Allocation stuff: */
+ u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
+ unsigned open_buckets_partial_nr;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
- struct bucket *buckets;
- unsigned short bucket_bits; /* ilog2(bucket_size) */
+ size_t fifo_last_bucket;
 /* last calculated minimum prio */
 u16 min_prio[2];
- /*
- * Bucket book keeping. The first element is updated by GC, the
- * second contains a saved copy of the stats from the beginning
- * of GC.
- */
- struct bch_dev_usage __percpu *usage_percpu;
- struct bch_dev_usage usage_cached;
- atomic_long_t saturated_count;
 size_t inc_gen_needs_gc;
 size_t inc_gen_really_needs_gc;
@@ -416,16 +394,11 @@ struct bch_dev {
 bool allocator_invalidating_data;
 alloc_heap alloc_heap;
- bucket_heap copygc_heap;
-
- /* Moving GC: */
- struct task_struct *moving_gc_read;
-
- struct bch_pd_controller moving_gc_pd;
-
- /* Tiering: */
- struct write_point tiering_write_point;
+ /* Copying GC: */
+ struct task_struct *copygc_thread;
+ copygc_heap copygc_heap;
+ struct bch_pd_controller copygc_pd;
 struct write_point copygc_write_point;
 struct journal_device journal;
@@ -433,9 +406,9 @@ struct bch_dev {
 struct work_struct io_error_work;
 /* The rest of this all shows up in sysfs */
- atomic64_t meta_sectors_written;
- atomic64_t btree_sectors_written;
- u64 __percpu *sectors_written;
+ atomic_t latency[2];
+
+ struct io_count __percpu *io_done;
 };
 /*
@@ -447,17 +420,28 @@ struct bch_dev {
 * won't automatically reattach).
 */
 enum {
+ /* startup: */
+ BCH_FS_BRAND_NEW_FS,
+ BCH_FS_ALLOC_READ_DONE,
+ BCH_FS_ALLOCATOR_STARTED,
 BCH_FS_INITIAL_GC_DONE,
+ BCH_FS_FSCK_DONE,
+
+ /* shutdown: */
 BCH_FS_EMERGENCY_RO,
 BCH_FS_WRITE_DISABLE_COMPLETE,
 BCH_FS_GC_STOPPING,
+
+ /* errors: */
+ BCH_FS_ERROR,
 BCH_FS_GC_FAILURE,
+
+ /* misc: */
 BCH_FS_BDEV_MOUNTED,
- BCH_FS_ERROR,
 BCH_FS_FSCK_FIXED_ERRORS,
- BCH_FS_FSCK_DONE,
 BCH_FS_FIXED_GENS,
 BCH_FS_REBUILD_REPLICAS,
+ BCH_FS_HOLD_BTREE_WRITES,
 };
 struct btree_debug {
@@ -472,7 +456,8 @@ struct bch_tier {
 struct task_struct *migrate;
 struct bch_pd_controller pd;
- struct dev_group devs;
+ struct bch_devs_mask devs;
+ struct write_point wp;
 };
 enum bch_fs_state {
@@ -518,13 +503,11 @@ struct bch_fs {
 uuid_le uuid;
 uuid_le user_uuid;
- u16 block_size;
- u16 btree_node_size;
+ u16 encoded_extent_max;
 u8 nr_devices;
 u8 clean;
- u8 str_hash_type;
 u8 encryption_type;
 u64 time_base_lo;
@@ -540,48 +523,14 @@ struct bch_fs {
 struct closure sb_write;
 struct mutex sb_lock;
- struct backing_dev_info bdi;
-
 /* BTREE CACHE */
- struct bio_set btree_read_bio;
+ struct bio_set btree_bio;
 struct btree_root btree_roots[BTREE_ID_NR];
+ bool btree_roots_dirty;
 struct mutex btree_root_lock;
- bool btree_cache_table_init_done;
- struct rhashtable btree_cache_table;
-
- /*
- * We never free a struct btree, except on shutdown - we just put it on
- * the btree_cache_freed list and reuse it later. This simplifies the
- * code, and it doesn't cost us much memory as the memory usage is
- * dominated by buffers that hold the actual btree node data and those
- * can be freed - and the number of struct btrees allocated is
- * effectively bounded.
- *
- * btree_cache_freeable effectively is a small cache - we use it because
- * high order page allocations can be rather expensive, and it's quite
- * common to delete and allocate btree nodes in quick succession. It
- * should never grow past ~2-3 nodes in practice.
- */
- struct mutex btree_cache_lock;
- struct list_head btree_cache;
- struct list_head btree_cache_freeable;
- struct list_head btree_cache_freed;
-
- /* Number of elements in btree_cache + btree_cache_freeable lists */
- unsigned btree_cache_used;
- unsigned btree_cache_reserve;
- struct shrinker btree_cache_shrink;
-
- /*
- * If we need to allocate memory for a new btree node and that
- * allocation fails, we can cannibalize another node in the btree cache
- * to satisfy the allocation - lock to guarantee only one thread does
- * this at a time:
- */
- struct closure_waitlist mca_wait;
- struct task_struct *btree_cache_alloc_lock;
+ struct btree_cache btree_cache;
 mempool_t btree_reserve_pool;
@@ -592,10 +541,7 @@ struct bch_fs {
 * when allocating btree reserves fail halfway through) - instead, we
 * can stick them here:
 */
- struct btree_alloc {
- struct open_bucket *ob;
- BKEY_PADDED(k);
- } btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
 unsigned btree_reserve_cache_nr;
 struct mutex btree_reserve_cache_lock;
@@ -608,22 +554,18 @@ struct bch_fs {
 struct workqueue_struct *copygc_wq;
 /* ALLOCATION */
- struct bch_pd_controller foreground_write_pd;
 struct delayed_work pd_controllers_update;
 unsigned pd_controllers_update_seconds;
- spinlock_t foreground_write_pd_lock;
- struct bch_write_op *write_wait_head;
- struct bch_write_op *write_wait_tail;
- struct timer_list foreground_write_wakeup;
 /*
 * These contain all r/w devices - i.e. devices we can currently
 * allocate from:
 */
- struct dev_group all_devs;
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
 struct bch_tier tiers[BCH_TIER_MAX];
 /* NULL if we only have devices in one tier: */
+ struct bch_devs_mask *fastest_devs;
 struct bch_tier *fastest_tier;
 u64 capacity; /* sectors */
@@ -641,8 +583,6 @@ struct bch_fs {
 struct bch_fs_usage usage_cached;
 struct lglock usage_lock;
- struct mutex bucket_lock;
-
 struct closure_waitlist freelist_wait;
 /*
@@ -655,27 +595,18 @@ struct bch_fs {
 struct io_clock io_clock[2];
- /* SECTOR ALLOCATOR */
- struct list_head open_buckets_open;
- struct list_head open_buckets_free;
- unsigned open_buckets_nr_free;
+ /* ALLOCATOR */
+ spinlock_t freelist_lock;
+ u8 open_buckets_freelist;
+ u8 open_buckets_nr_free;
 struct closure_waitlist open_buckets_wait;
- spinlock_t open_buckets_lock;
 struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
 struct write_point btree_write_point;
 struct write_point write_points[WRITE_POINT_COUNT];
- struct write_point promote_write_point;
-
- /*
- * This write point is used for migrating data off a device
- * and can point to any other device.
- * We can't use the normal write points because those will
- * gang up n replicas, and for migration we want only one new
- * replica.
- */
- struct write_point migration_write_point;
+ struct hlist_head write_points_hash[WRITE_POINT_COUNT];
+ struct mutex write_points_hash_lock;
 /* GARBAGE COLLECTION */
 struct task_struct *gc_thread;
@@ -721,9 +652,14 @@ struct bch_fs {
 atomic64_t key_version;
- struct bio_list read_retry_list;
- struct work_struct read_retry_work;
- spinlock_t read_retry_lock;
+ /* VFS IO PATH - fs-io.c */
+ struct bio_set writepage_bioset;
+ struct bio_set dio_write_bioset;
+ struct bio_set dio_read_bioset;
+
+ struct bio_list btree_write_error_list;
+ struct work_struct btree_write_error_work;
+ spinlock_t btree_write_error_lock;
 /* ERRORS */
 struct list_head fsck_errors;
@@ -736,6 +672,9 @@ struct bch_fs {
 unsigned writeback_pages_max;
 atomic_long_t nr_inodes;
+ /* QUOTAS */
+ struct bch_memquota_type quotas[QTYP_NR];
+
 /* DEBUG JUNK */
 struct dentry *debug;
 struct btree_debug btree_debug[BTREE_ID_NR];
@@ -761,19 +700,14 @@ struct bch_fs {
 /* The rest of this all shows up in sysfs */
 atomic_long_t read_realloc_races;
+ atomic_long_t extent_migrate_done;
+ atomic_long_t extent_migrate_raced;
 unsigned btree_gc_periodic:1;
- unsigned foreground_write_ratelimit_enabled:1;
 unsigned copy_gc_enabled:1;
 unsigned tiering_enabled:1;
 unsigned tiering_percent;
- /*
- * foreground writes will be throttled when the number of free
- * buckets is below this percentage
- */
- unsigned foreground_target_percent;
-
 #define BCH_DEBUG_PARAM(name, description) bool name;
 BCH_DEBUG_PARAMS_ALL()
 #undef BCH_DEBUG_PARAM
@@ -784,14 +718,17 @@ struct bch_fs {
 #undef BCH_TIME_STAT
 };
-static inline bool bch2_fs_running(struct bch_fs *c)
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
 {
- return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+#ifndef NO_BCACHEFS_FS
+ if (c->vfs_sb)
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
 }
-static inline unsigned bucket_pages(const struct bch_dev *ca)
+static inline bool bch2_fs_running(struct bch_fs *c)
 {
- return ca->mi.bucket_size / PAGE_SECTORS;
+ return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
 }
 static inline unsigned bucket_bytes(const struct bch_dev *ca)
@@ -801,7 +738,7 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca)
 static inline unsigned block_bytes(const struct bch_fs *c)
 {
- return c->sb.block_size << 9;
+ return c->opts.block_size << 9;
 }
 #endif /* _BCACHEFS_H */
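
A minimal sketch (not part of the patch) of how the new per-device io_done counters added above - struct io_count __percpu *io_done, with sectors[2][BCH_DATA_NR] indexed by READ/WRITE and a data type - might be bumped and summed, assuming the usual kernel percpu helpers and the existing enum bch_data_type that BCH_DATA_NR belongs to. The helper names bch2_dev_io_acct() and bch2_dev_io_total() are hypothetical and used only for illustration:

static inline void bch2_dev_io_acct(struct bch_dev *ca, int rw,
				    enum bch_data_type type, u64 sectors)
{
	/* lockless per-cpu accumulation into the __percpu io_done counters */
	this_cpu_add(ca->io_done->sectors[rw][type], sectors);
}

static inline u64 bch2_dev_io_total(struct bch_dev *ca, int rw,
				    enum bch_data_type type)
{
	u64 ret = 0;
	int cpu;

	/* sum every cpu's counter when reporting, e.g. through sysfs */
	for_each_possible_cpu(cpu)
		ret += per_cpu_ptr(ca->io_done, cpu)->sectors[rw][type];
	return ret;
}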