Update bcachefs sources to 02ae70070a bcachefs: Allocate new btree roots lazily

[bcachefs-tools-debian] / libbcachefs / bcachefs.h
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h

index ab99af7b3a24d075731b805e14de32e60d471daf..78c427fa17a63bb7c42706751afa7954e7c2e6ca 100644 (file)
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -251,9 +251,6 @@ do {                                                                        \
         BCH_DEBUG_PARAM(debug_check_bkeys,                              \
                 "Run bkey_debugcheck (primarily checking GC/allocation "\
                 "information) when iterating over keys")                \
-       BCH_DEBUG_PARAM(version_stress_test,                            \
-               "Assigns random version numbers to newly written "      \
-               "extents, to test overlapping extent cases")            \
         BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
                 "Reread btree nodes at various points to verify the "   \
                 "mergesort in the read path against modifications "     \
@@ -282,15 +279,11 @@ do {                                                                      \
  #include "alloc_types.h"
  #include "buckets_types.h"
  #include "clock_types.h"
-#include "io_types.h"
  #include "journal_types.h"
  #include "keylist_types.h"
-#include "move_types.h"
+#include "quota_types.h"
  #include "super_types.h"
  
-/* 256k, in sectors */
-#define BTREE_NODE_SIZE_MAX            512
-
  /*
   * Number of nodes we might have to allocate in a worst case btree split
   * operation - we split all the way up to the root, then allocate a new root.
@@ -312,8 +305,9 @@ struct crypto_blkcipher;
  struct crypto_ahash;
  
  enum gc_phase {
-       GC_PHASE_SB_METADATA            = BTREE_ID_NR + 1,
+       GC_PHASE_SB             = BTREE_ID_NR + 1,
         GC_PHASE_PENDING_DELETE,
+       GC_PHASE_ALLOC,
         GC_PHASE_DONE
  };
  
@@ -323,35 +317,16 @@ struct gc_pos {
         unsigned                level;
  };
  
-struct bch_member_cpu {
-       u64                     nbuckets;       /* device size */
-       u16                     first_bucket;   /* index of first bucket used */
-       u16                     bucket_size;    /* sectors */
-       u8                      state;
-       u8                      tier;
-       u8                      replacement;
-       u8                      discard;
-       u8                      valid;
-};
-
-struct bch_replicas_cpu_entry {
-       u8                      data_type;
-       u8                      devs[BCH_SB_MEMBERS_MAX / 8];
-};
-
-struct bch_replicas_cpu {
-       struct rcu_head         rcu;
-       unsigned                nr;
-       unsigned                entry_size;
-       struct bch_replicas_cpu_entry entries[];
+struct io_count {
+       u64                     sectors[2][BCH_DATA_NR];
  };
  
  struct bch_dev {
         struct kobject          kobj;
         struct percpu_ref       ref;
+       struct completion       ref_completion;
         struct percpu_ref       io_ref;
-       struct completion       stop_complete;
-       struct completion       offline_complete;
+       struct completion       io_ref_completion;
  
         struct bch_fs           *fs;
  
@@ -364,16 +339,31 @@ struct bch_dev {
         uuid_le                 uuid;
         char                    name[BDEVNAME_SIZE];
  
-       struct bcache_superblock disk_sb;
+       struct bch_sb_handle    disk_sb;
+       int                     sb_write_error;
  
-       struct dev_group        self;
+       struct bch_devs_mask    self;
  
-       /* biosets used in cloned bios for replicas and moving_gc */
+       /* biosets used in cloned bios for writing multiple replicas */
         struct bio_set          replica_set;
  
-       struct task_struct      *alloc_thread;
+       /*
+        * Buckets:
+        * Per-bucket arrays are protected by c->usage_lock, bucket_lock and
+        * gc_lock, for device resize - holding any one of them is sufficient
+        * for access. rcu_read_lock() also suffices, but only for ptr_stale().
+        */
+       struct bucket_array __rcu *buckets;
+       unsigned long           *buckets_dirty;
+       /* most out of date gen in the btree */
+       u8                      *oldest_gens;
+       struct rw_semaphore     bucket_lock;
  
-       bool                    need_alloc_write;
+       struct bch_dev_usage __percpu *usage_percpu;
+       struct bch_dev_usage    usage_cached;
+
+       /* Allocator: */
+       struct task_struct      *alloc_thread;
  
         /*
          * free: Buckets that are ready to be used
@@ -384,31 +374,19 @@ struct bch_dev {
          * gens/prios, they'll be moved to the free list (and possibly discarded
          * in the process)
          */
-       DECLARE_FIFO(long, free)[RESERVE_NR];
-       DECLARE_FIFO(long, free_inc);
+       alloc_fifo              free[RESERVE_NR];
+       alloc_fifo              free_inc;
         spinlock_t              freelist_lock;
-       bool                    alloc_thread_started;
-
-       size_t                  fifo_last_bucket;
+       unsigned                nr_invalidated;
  
-       /* Allocation stuff: */
+       u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
+       unsigned                open_buckets_partial_nr;
  
-       /* most out of date gen in the btree */
-       u8                      *oldest_gens;
-       struct bucket           *buckets;
-       unsigned short          bucket_bits;    /* ilog2(bucket_size) */
+       size_t                  fifo_last_bucket;
  
         /* last calculated minimum prio */
         u16                     min_prio[2];
  
-       /*
-        * Bucket book keeping. The first element is updated by GC, the
-        * second contains a saved copy of the stats from the beginning
-        * of GC.
-        */
-       struct bch_dev_usage __percpu *usage_percpu;
-       struct bch_dev_usage    usage_cached;
-
         atomic_long_t           saturated_count;
         size_t                  inc_gen_needs_gc;
         size_t                  inc_gen_really_needs_gc;
@@ -416,16 +394,11 @@ struct bch_dev {
         bool                    allocator_invalidating_data;
  
         alloc_heap              alloc_heap;
-       bucket_heap             copygc_heap;
-
-       /* Moving GC: */
-       struct task_struct      *moving_gc_read;
-
-       struct bch_pd_controller moving_gc_pd;
-
-       /* Tiering: */
-       struct write_point      tiering_write_point;
  
+       /* Copying GC: */
+       struct task_struct      *copygc_thread;
+       copygc_heap             copygc_heap;
+       struct bch_pd_controller copygc_pd;
         struct write_point      copygc_write_point;
  
         struct journal_device   journal;
@@ -433,9 +406,9 @@ struct bch_dev {
         struct work_struct      io_error_work;
  
         /* The rest of this all shows up in sysfs */
-       atomic64_t              meta_sectors_written;
-       atomic64_t              btree_sectors_written;
-       u64 __percpu            *sectors_written;
+       atomic_t                latency[2];
+
+       struct io_count __percpu *io_done;
  };
  
  /*
@@ -447,17 +420,28 @@ struct bch_dev {
   * won't automatically reattach).
   */
  enum {
+       /* startup: */
+       BCH_FS_BRAND_NEW_FS,
+       BCH_FS_ALLOC_READ_DONE,
+       BCH_FS_ALLOCATOR_STARTED,
         BCH_FS_INITIAL_GC_DONE,
+       BCH_FS_FSCK_DONE,
+
+       /* shutdown: */
         BCH_FS_EMERGENCY_RO,
         BCH_FS_WRITE_DISABLE_COMPLETE,
         BCH_FS_GC_STOPPING,
+
+       /* errors: */
+       BCH_FS_ERROR,
         BCH_FS_GC_FAILURE,
+
+       /* misc: */
         BCH_FS_BDEV_MOUNTED,
-       BCH_FS_ERROR,
         BCH_FS_FSCK_FIXED_ERRORS,
-       BCH_FS_FSCK_DONE,
         BCH_FS_FIXED_GENS,
         BCH_FS_REBUILD_REPLICAS,
+       BCH_FS_HOLD_BTREE_WRITES,
  };
  
  struct btree_debug {
@@ -472,7 +456,8 @@ struct bch_tier {
         struct task_struct      *migrate;
         struct bch_pd_controller pd;
  
-       struct dev_group        devs;
+       struct bch_devs_mask    devs;
+       struct write_point      wp;
  };
  
  enum bch_fs_state {
@@ -518,13 +503,11 @@ struct bch_fs {
                 uuid_le         uuid;
                 uuid_le         user_uuid;
  
-               u16             block_size;
-               u16             btree_node_size;
+               u16             encoded_extent_max;
  
                 u8              nr_devices;
                 u8              clean;
  
-               u8              str_hash_type;
                 u8              encryption_type;
  
                 u64             time_base_lo;
@@ -540,48 +523,14 @@ struct bch_fs {
         struct closure          sb_write;
         struct mutex            sb_lock;
  
-       struct backing_dev_info bdi;
-
         /* BTREE CACHE */
-       struct bio_set          btree_read_bio;
+       struct bio_set          btree_bio;
  
         struct btree_root       btree_roots[BTREE_ID_NR];
+       bool                    btree_roots_dirty;
         struct mutex            btree_root_lock;
  
-       bool                    btree_cache_table_init_done;
-       struct rhashtable       btree_cache_table;
-
-       /*
-        * We never free a struct btree, except on shutdown - we just put it on
-        * the btree_cache_freed list and reuse it later. This simplifies the
-        * code, and it doesn't cost us much memory as the memory usage is
-        * dominated by buffers that hold the actual btree node data and those
-        * can be freed - and the number of struct btrees allocated is
-        * effectively bounded.
-        *
-        * btree_cache_freeable effectively is a small cache - we use it because
-        * high order page allocations can be rather expensive, and it's quite
-        * common to delete and allocate btree nodes in quick succession. It
-        * should never grow past ~2-3 nodes in practice.
-        */
-       struct mutex            btree_cache_lock;
-       struct list_head        btree_cache;
-       struct list_head        btree_cache_freeable;
-       struct list_head        btree_cache_freed;
-
-       /* Number of elements in btree_cache + btree_cache_freeable lists */
-       unsigned                btree_cache_used;
-       unsigned                btree_cache_reserve;
-       struct shrinker         btree_cache_shrink;
-
-       /*
-        * If we need to allocate memory for a new btree node and that
-        * allocation fails, we can cannibalize another node in the btree cache
-        * to satisfy the allocation - lock to guarantee only one thread does
-        * this at a time:
-        */
-       struct closure_waitlist mca_wait;
-       struct task_struct      *btree_cache_alloc_lock;
+       struct btree_cache      btree_cache;
  
         mempool_t               btree_reserve_pool;
  
@@ -592,10 +541,7 @@ struct bch_fs {
          * when allocating btree reserves fail halfway through) - instead, we
          * can stick them here:
          */
-       struct btree_alloc {
-               struct open_bucket      *ob;
-               BKEY_PADDED(k);
-       }                       btree_reserve_cache[BTREE_NODE_RESERVE * 2];
+       struct btree_alloc      btree_reserve_cache[BTREE_NODE_RESERVE * 2];
         unsigned                btree_reserve_cache_nr;
         struct mutex            btree_reserve_cache_lock;
  
@@ -608,22 +554,18 @@ struct bch_fs {
         struct workqueue_struct *copygc_wq;
  
         /* ALLOCATION */
-       struct bch_pd_controller foreground_write_pd;
         struct delayed_work     pd_controllers_update;
         unsigned                pd_controllers_update_seconds;
-       spinlock_t              foreground_write_pd_lock;
-       struct bch_write_op     *write_wait_head;
-       struct bch_write_op     *write_wait_tail;
  
-       struct timer_list       foreground_write_wakeup;
  
         /*
          * These contain all r/w devices - i.e. devices we can currently
          * allocate from:
          */
-       struct dev_group        all_devs;
+       struct bch_devs_mask    rw_devs[BCH_DATA_NR];
         struct bch_tier         tiers[BCH_TIER_MAX];
         /* NULL if we only have devices in one tier: */
+       struct bch_devs_mask    *fastest_devs;
         struct bch_tier         *fastest_tier;
  
         u64                     capacity; /* sectors */
@@ -641,8 +583,6 @@ struct bch_fs {
         struct bch_fs_usage     usage_cached;
         struct lglock           usage_lock;
  
-       struct mutex            bucket_lock;
-
         struct closure_waitlist freelist_wait;
  
         /*
@@ -655,27 +595,18 @@ struct bch_fs {
  
         struct io_clock         io_clock[2];
  
-       /* SECTOR ALLOCATOR */
-       struct list_head        open_buckets_open;
-       struct list_head        open_buckets_free;
-       unsigned                open_buckets_nr_free;
+       /* ALLOCATOR */
+       spinlock_t              freelist_lock;
+       u8                      open_buckets_freelist;
+       u8                      open_buckets_nr_free;
         struct closure_waitlist open_buckets_wait;
-       spinlock_t              open_buckets_lock;
         struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
  
         struct write_point      btree_write_point;
  
         struct write_point      write_points[WRITE_POINT_COUNT];
-       struct write_point      promote_write_point;
-
-       /*
-        * This write point is used for migrating data off a device
-        * and can point to any other device.
-        * We can't use the normal write points because those will
-        * gang up n replicas, and for migration we want only one new
-        * replica.
-        */
-       struct write_point      migration_write_point;
+       struct hlist_head       write_points_hash[WRITE_POINT_COUNT];
+       struct mutex            write_points_hash_lock;
  
         /* GARBAGE COLLECTION */
         struct task_struct      *gc_thread;
@@ -721,9 +652,14 @@ struct bch_fs {
  
         atomic64_t              key_version;
  
-       struct bio_list         read_retry_list;
-       struct work_struct      read_retry_work;
-       spinlock_t              read_retry_lock;
+       /* VFS IO PATH - fs-io.c */
+       struct bio_set          writepage_bioset;
+       struct bio_set          dio_write_bioset;
+       struct bio_set          dio_read_bioset;
+
+       struct bio_list         btree_write_error_list;
+       struct work_struct      btree_write_error_work;
+       spinlock_t              btree_write_error_lock;
  
         /* ERRORS */
         struct list_head        fsck_errors;
@@ -736,6 +672,9 @@ struct bch_fs {
         unsigned                writeback_pages_max;
         atomic_long_t           nr_inodes;
  
+       /* QUOTAS */
+       struct bch_memquota_type quotas[QTYP_NR];
+
         /* DEBUG JUNK */
         struct dentry           *debug;
         struct btree_debug      btree_debug[BTREE_ID_NR];
@@ -761,19 +700,14 @@ struct bch_fs {
  
         /* The rest of this all shows up in sysfs */
         atomic_long_t           read_realloc_races;
+       atomic_long_t           extent_migrate_done;
+       atomic_long_t           extent_migrate_raced;
  
         unsigned                btree_gc_periodic:1;
-       unsigned                foreground_write_ratelimit_enabled:1;
         unsigned                copy_gc_enabled:1;
         unsigned                tiering_enabled:1;
         unsigned                tiering_percent;
  
-       /*
-        * foreground writes will be throttled when the number of free
-        * buckets is below this percentage
-        */
-       unsigned                foreground_target_percent;
-
  #define BCH_DEBUG_PARAM(name, description) bool name;
         BCH_DEBUG_PARAMS_ALL()
  #undef BCH_DEBUG_PARAM
@@ -784,14 +718,17 @@ struct bch_fs {
  #undef BCH_TIME_STAT
  };
  
-static inline bool bch2_fs_running(struct bch_fs *c)
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
  {
-       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+#ifndef NO_BCACHEFS_FS
+       if (c->vfs_sb)
+               c->vfs_sb->s_bdi->ra_pages = ra_pages;
+#endif
  }
  
-static inline unsigned bucket_pages(const struct bch_dev *ca)
+static inline bool bch2_fs_running(struct bch_fs *c)
  {
-       return ca->mi.bucket_size / PAGE_SECTORS;
+       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
  }
  
  static inline unsigned bucket_bytes(const struct bch_dev *ca)
@@ -801,7 +738,7 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca)
  
  static inline unsigned block_bytes(const struct bch_fs *c)
  {
-       return c->sb.block_size << 9;
+       return c->opts.block_size << 9;
  }
  
  #endif /* _BCACHEFS_H */