git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 90d78c2461 bcachefs: Option parsing for io targets
author Kent Overstreet <kent.overstreet@gmail.com>
Mon, 19 Feb 2018 02:43:46 +0000 (21:43 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Tue, 20 Feb 2018 00:01:05 +0000 (19:01 -0500)
31 files changed:
.bcachefs_revision
include/linux/sched/cputime.h [new file with mode: 0644]
include/trace/events/bcachefs.h
libbcachefs/alloc.c
libbcachefs/alloc.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_update_interior.c
libbcachefs/chardev.c
libbcachefs/compress.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tier.c
libbcachefs/tier.h
libbcachefs/xattr.c

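The bulk of this commit replaces the old tiering machinery with explicit io targets (promote_target, foreground_target, background_target) that can be set in the superblock, per inode, and in struct bch_io_opts, and it teaches the option code to parse such options through per-option callbacks: the new BCH_OPT_FN type in libbcachefs/opts.c dispatches to opt->parse() instead of the generic bool/uint/string paths. The standalone sketch below is illustrative only; the names and the toy target encoding are invented rather than the bcachefs API, but it shows the shape of that dispatch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum opt_type { OPT_BOOL, OPT_UINT, OPT_FN };

struct option_desc {
	const char	*name;
	enum opt_type	type;
	/* used only by OPT_FN options, e.g. "foreground_target=ssd" */
	int		(*parse)(const char *val, unsigned long *res);
};

/* toy stand-in for a target parser: map a label to a small id */
static int target_parse(const char *val, unsigned long *res)
{
	static const char * const targets[] = { "none", "ssd", "hdd" };
	unsigned i;

	for (i = 0; i < sizeof(targets) / sizeof(targets[0]); i++)
		if (!strcmp(val, targets[i])) {
			*res = i;
			return 0;
		}
	return -1;
}

static const struct option_desc opt_table[] = {
	{ "verbose",		OPT_BOOL,	NULL },
	{ "data_replicas",	OPT_UINT,	NULL },
	{ "foreground_target",	OPT_FN,		target_parse },
};

static int opt_parse(const struct option_desc *opt, const char *val,
		     unsigned long *res)
{
	switch (opt->type) {
	case OPT_BOOL:
		*res = !strcmp(val, "1");
		return 0;
	case OPT_UINT:
		*res = strtoul(val, NULL, 10);
		return 0;
	case OPT_FN:
		/* the new idea: hand the raw string to the option's own parser */
		return opt->parse(val, res);
	}
	return -1;
}

int main(void)
{
	unsigned long v;

	if (!opt_parse(&opt_table[2], "ssd", &v))
		printf("foreground_target -> %lu\n", v);
	return 0;
}
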
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 76acdf9367a3954946f1902ad691f692c116fae4..d29d45d4fa911d11a8b3c28308646cce0fb98d92 100644
@@ -1 +1 @@
-e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5
+90d78c246188f4e90bd9ceb29fe95186b7dc680d
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
new file mode 100644
index 0000000..a89c626
--- /dev/null
@@ -0,0 +1,6 @@
+
+static inline void task_cputime_adjusted(struct task_struct *p, u64 *utime, u64 *stime)
+{
+       *utime = 0;
+       *stime = 0;
+}
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index d132dd8a75e6ccbc024395ab521cd1a2df4d2eec..a7be2d8222d89b3a48637045d4e2033811d097bb 100644
@@ -49,15 +49,13 @@ DECLARE_EVENT_CLASS(bch_dev,
 
        TP_STRUCT__entry(
                __array(char,           uuid,   16      )
-               __field(unsigned,       tier            )
        ),
 
        TP_fast_assign(
                memcpy(__entry->uuid, ca->uuid.b, 16);
-               __entry->tier = ca->mi.tier;
        ),
 
-       TP_printk("%pU tier %u", __entry->uuid, __entry->tier)
+       TP_printk("%pU", __entry->uuid)
 );
 
 DECLARE_EVENT_CLASS(bch_fs,
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 339ffd02c45f5a7e8a21d60e37b4cf7470246eb2..a76f2b7cc48a22a708600972ae8248ce15f84f9b 100644
@@ -89,69 +89,29 @@ static void pd_controllers_update(struct work_struct *work)
                                           struct bch_fs,
                                           pd_controllers_update);
        struct bch_dev *ca;
-       unsigned i, iter;
-
-       /* All units are in bytes */
-       u64 faster_tiers_size   = 0;
-       u64 faster_tiers_dirty  = 0;
-
-       u64 copygc_can_free     = 0;
-
-       rcu_read_lock();
-       for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
-               bch2_pd_controller_update(&c->tiers[i].pd,
-                               div_u64(faster_tiers_size *
-                                       c->tiering_percent, 100),
-                               faster_tiers_dirty,
-                               -1);
-
-               for_each_member_device_rcu(ca, c, iter, &c->tiers[i].devs) {
-                       struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
-
-                       u64 size = bucket_to_sector(ca, ca->mi.nbuckets -
-                                       ca->mi.first_bucket) << 9;
-                       u64 dirty = bucket_to_sector(ca,
-                                       stats.buckets[BCH_DATA_USER]) << 9;
-                       u64 free = bucket_to_sector(ca,
-                                       __dev_buckets_free(ca, stats)) << 9;
-                       /*
-                        * Bytes of internal fragmentation, which can be
-                        * reclaimed by copy GC
-                        */
-                       s64 fragmented = (bucket_to_sector(ca,
-                                               stats.buckets[BCH_DATA_USER] +
-                                               stats.buckets[BCH_DATA_CACHED]) -
-                                         (stats.sectors[BCH_DATA_USER] +
-                                          stats.sectors[BCH_DATA_CACHED])) << 9;
+       unsigned i;
 
-                       fragmented = max(0LL, fragmented);
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
 
-                       bch2_pd_controller_update(&ca->copygc_pd,
-                                                free, fragmented, -1);
+               u64 free = bucket_to_sector(ca,
+                               __dev_buckets_free(ca, stats)) << 9;
+               /*
+                * Bytes of internal fragmentation, which can be
+                * reclaimed by copy GC
+                */
+               s64 fragmented = (bucket_to_sector(ca,
+                                       stats.buckets[BCH_DATA_USER] +
+                                       stats.buckets[BCH_DATA_CACHED]) -
+                                 (stats.sectors[BCH_DATA_USER] +
+                                  stats.sectors[BCH_DATA_CACHED])) << 9;
 
-                       faster_tiers_size               += size;
-                       faster_tiers_dirty              += dirty;
+               fragmented = max(0LL, fragmented);
 
-                       copygc_can_free                 += fragmented;
-               }
+               bch2_pd_controller_update(&ca->copygc_pd,
+                                        free, fragmented, -1);
        }
 
-       rcu_read_unlock();
-
-       /*
-        * Throttle foreground writes if tier 0 is running out of free buckets,
-        * and either tiering or copygc can free up space.
-        *
-        * Target will be small if there isn't any work to do - we don't want to
-        * throttle foreground writes if we currently have all the free space
-        * we're ever going to have.
-        *
-        * Otherwise, if there's work to do, try to keep 20% of tier0 available
-        * for foreground writes.
-        */
-       if (c->fastest_tier)
-               copygc_can_free = U64_MAX;
-
        schedule_delayed_work(&c->pd_controllers_update,
                              c->pd_controllers_update_seconds * HZ);
 }
@@ -1201,22 +1161,14 @@ out:
        return ob - c->open_buckets;
 }
 
-static int __dev_alloc_cmp(struct bch_fs *c,
-                          struct write_point *wp,
+static int __dev_alloc_cmp(struct write_point *wp,
                           unsigned l, unsigned r)
 {
-       struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
-       struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
-
-       if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
-               return ((ca_l->mi.tier > ca_r->mi.tier) -
-                       (ca_l->mi.tier < ca_r->mi.tier));
-
        return ((wp->next_alloc[l] > wp->next_alloc[r]) -
                (wp->next_alloc[l] < wp->next_alloc[r]));
 }
 
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
 
 struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
                                         struct write_point *wp,
@@ -1355,7 +1307,7 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
 
 static void writepoint_drop_ptrs(struct bch_fs *c,
                                 struct write_point *wp,
-                                struct bch_devs_mask *devs,
+                                u16 target, bool in_target,
                                 unsigned nr_ptrs_dislike)
 {
        int i;
@@ -1367,7 +1319,8 @@ static void writepoint_drop_ptrs(struct bch_fs *c,
                struct open_bucket *ob = wp->ptrs[i];
                struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
-               if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) {
+               if (nr_ptrs_dislike &&
+                   dev_in_target(ca, target) == in_target) {
                        BUG_ON(ca->open_buckets_partial_nr >=
                               ARRAY_SIZE(ca->open_buckets_partial));
 
@@ -1401,7 +1354,7 @@ static void verify_not_stale(struct bch_fs *c, const struct write_point *wp)
 }
 
 static int open_bucket_add_buckets(struct bch_fs *c,
-                                  struct bch_devs_mask *_devs,
+                                  u16 target,
                                   struct write_point *wp,
                                   struct bch_devs_list *devs_have,
                                   unsigned nr_replicas,
@@ -1422,8 +1375,15 @@ static int open_bucket_add_buckets(struct bch_fs *c,
        writepoint_for_each_ptr(wp, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);
 
-       if (_devs)
-               bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX);
+       if (target) {
+               const struct bch_devs_mask *t;
+
+               rcu_read_lock();
+               t = bch2_target_to_mask(c, target);
+               if (t)
+                       bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+               rcu_read_unlock();
+       }
 
        return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl);
 }
@@ -1503,7 +1463,7 @@ out:
  * Get us an open_bucket we can allocate from, return with it locked:
  */
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
-                               struct bch_devs_mask *devs,
+                               unsigned target,
                                struct write_point_specifier write_point,
                                struct bch_devs_list *devs_have,
                                unsigned nr_replicas,
@@ -1525,17 +1485,27 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        writepoint_for_each_ptr(wp, ob, i)
                if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev))
                        nr_ptrs_have++;
-               else if (devs && !test_bit(ob->ptr.dev, devs->d))
+               else if (!dev_in_target(c->devs[ob->ptr.dev], target))
                        nr_ptrs_dislike++;
 
-       ret = open_bucket_add_buckets(c, devs, wp, devs_have,
+       ret = open_bucket_add_buckets(c, target, wp, devs_have,
                                nr_replicas + nr_ptrs_have + nr_ptrs_dislike,
                                reserve, cl);
        if (ret && ret != -EROFS)
                goto err;
 
-       if (wp->nr_ptrs <
-           nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) {
+       if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
+               goto alloc_done;
+
+       ret = open_bucket_add_buckets(c, target, wp, devs_have,
+                               nr_replicas + nr_ptrs_have,
+                               reserve, cl);
+       if (ret && ret != -EROFS)
+               goto err;
+alloc_done:
+       if (wp->nr_ptrs - nr_ptrs_have -
+           ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0)
+           < nr_replicas_required) {
                ret = -EROFS;
                goto err;
        }
@@ -1545,7 +1515,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
                                          0, nr_ptrs_dislike);
 
        /* Remove pointers we don't want to use: */
-       writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike);
+       writepoint_drop_ptrs(c, wp, target, false, nr_ptrs_dislike);
 
        /*
         * Move pointers to devices we already have to end of open bucket
@@ -1637,7 +1607,6 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 
 void bch2_recalc_capacity(struct bch_fs *c)
 {
-       struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
        struct bch_dev *ca;
        u64 total_capacity, capacity = 0, reserved_sectors = 0;
        unsigned long ra_pages = 0;
@@ -1653,28 +1622,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
        bch2_set_ra_pages(c, ra_pages);
 
-       /* Find fastest, slowest tiers with devices: */
-
-       for (tier = c->tiers;
-            tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
-               if (!dev_mask_nr(&tier->devs))
-                       continue;
-               if (!fastest_tier)
-                       fastest_tier = tier;
-               slowest_tier = tier;
-       }
-
-       c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
-       c->fastest_devs = fastest_tier != slowest_tier ? &fastest_tier->devs : NULL;
-
-       if (!fastest_tier)
-               goto set_capacity;
-
-       /*
-        * Capacity of the filesystem is the capacity of all the devices in the
-        * slowest (highest) tier - we don't include lower tier devices.
-        */
-       for_each_member_device_rcu(ca, c, i, &slowest_tier->devs) {
+       for_each_rw_member(ca, c, i) {
                size_t reserve = 0;
 
                /*
@@ -1700,16 +1648,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
                reserve += ARRAY_SIZE(c->write_points);
 
-               if (ca->mi.tier)
-                       reserve += 1;   /* tiering write point */
-               reserve += 1;           /* btree write point */
+               reserve += 1;   /* btree write point */
 
                reserved_sectors += bucket_to_sector(ca, reserve);
 
                capacity += bucket_to_sector(ca, ca->mi.nbuckets -
                                             ca->mi.first_bucket);
        }
-set_capacity:
+
        total_capacity = capacity;
 
        capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1745,7 +1691,8 @@ static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca,
        bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX);
 
        mutex_lock(&wp->lock);
-       writepoint_drop_ptrs(c, wp, &not_self, wp->nr_ptrs);
+       writepoint_drop_ptrs(c, wp, dev_to_target(ca->dev_idx),
+                            true, wp->nr_ptrs);
        mutex_unlock(&wp->lock);
 }
 
@@ -1776,7 +1723,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 
        /* First, remove device from allocation groups: */
 
-       clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                clear_bit(ca->dev_idx, c->rw_devs[i].d);
 
@@ -1790,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
                bch2_stop_write_point(c, ca, &c->write_points[i]);
 
        bch2_stop_write_point(c, ca, &ca->copygc_write_point);
-       bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp);
+       bch2_stop_write_point(c, ca, &c->rebalance_write_point);
        bch2_stop_write_point(c, ca, &c->btree_write_point);
 
        mutex_lock(&c->btree_reserve_cache_lock);
@@ -1828,7 +1774,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
                if (ca->mi.data_allowed & (1 << i))
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
-       set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
 }
 
 /* stop allocator thread: */
@@ -2059,7 +2004,6 @@ void bch2_fs_allocator_init(struct bch_fs *c)
 {
        struct open_bucket *ob;
        struct write_point *wp;
-       unsigned i;
 
        mutex_init(&c->write_points_hash_lock);
        spin_lock_init(&c->freelist_lock);
@@ -2079,9 +2023,7 @@ void bch2_fs_allocator_init(struct bch_fs *c)
        }
 
        writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
-
-       for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
-               writepoint_init(&c->tiers[i].wp, BCH_DATA_USER);
+       writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
 
        for (wp = c->write_points;
             wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
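
With tiers gone, pd_controllers_update() in the first hunk above feeds each device's copygc controller just two inputs, free space and reclaimable fragmentation, both in bytes. The arithmetic is easy to show with made-up numbers; the sketch below is purely illustrative and uses none of the bcachefs types.

#include <stdio.h>

int main(void)
{
	/* all numbers are illustrative; sectors are 512 bytes */
	unsigned long long bucket_sectors = 1024;	/* 512 KiB buckets */
	unsigned long long user_buckets   = 1000, cached_buckets = 200;
	unsigned long long user_sectors   = 900000, cached_sectors = 150000;
	unsigned long long free_buckets   = 300;

	unsigned long long free_bytes = (free_buckets * bucket_sectors) << 9;

	/* sectors covered by used buckets minus live sectors = reclaimable */
	long long frag_sectors =
		(long long)((user_buckets + cached_buckets) * bucket_sectors) -
		(long long)(user_sectors + cached_sectors);
	if (frag_sectors < 0)
		frag_sectors = 0;

	printf("free %llu bytes, reclaimable by copygc %llu bytes\n",
	       free_bytes, (unsigned long long)frag_sectors << 9);
	return 0;
}
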
diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h
index 3bdc294691def2e98652e86b1acc9c6823eaf200..5b58922344a8e2f60a587cb55cd84a784e3b4571 100644
@@ -66,7 +66,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
 }
 
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
-                                            struct bch_devs_mask *,
+                                            unsigned,
                                             struct write_point_specifier,
                                             struct bch_devs_list *,
                                             unsigned, unsigned,
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 5a3e99b3b79a53ed770e4edc0b4c11a21832448b..75f3a006127957aa561891da62a82e3438d514a8 100644
@@ -408,6 +408,8 @@ struct bch_dev {
        struct bch_pd_controller copygc_pd;
        struct write_point      copygc_write_point;
 
+       atomic64_t              rebalance_work;
+
        struct journal_device   journal;
 
        struct work_struct      io_error_work;
@@ -458,15 +460,6 @@ struct btree_debug {
        struct dentry           *failed;
 };
 
-struct bch_tier {
-       unsigned                idx;
-       struct task_struct      *migrate;
-       struct bch_pd_controller pd;
-
-       struct bch_devs_mask    devs;
-       struct write_point      wp;
-};
-
 enum bch_fs_state {
        BCH_FS_STARTING         = 0,
        BCH_FS_STOPPING,
@@ -522,6 +515,7 @@ struct bch_fs {
                u64             time_base_lo;
                u32             time_base_hi;
                u32             time_precision;
+               u64             features;
        }                       sb;
 
        struct bch_sb           *disk_sb;
@@ -569,16 +563,13 @@ struct bch_fs {
        struct delayed_work     pd_controllers_update;
        unsigned                pd_controllers_update_seconds;
 
+       /* REBALANCE */
+       struct task_struct      *rebalance_thread;
+       struct bch_pd_controller rebalance_pd;
+
+       atomic64_t              rebalance_work_unknown_dev;
 
-       /*
-        * These contain all r/w devices - i.e. devices we can currently
-        * allocate from:
-        */
        struct bch_devs_mask    rw_devs[BCH_DATA_NR];
-       struct bch_tier         tiers[BCH_TIER_MAX];
-       /* NULL if we only have devices in one tier: */
-       struct bch_devs_mask    *fastest_devs;
-       struct bch_tier         *fastest_tier;
 
        u64                     capacity; /* sectors */
 
@@ -615,6 +606,7 @@ struct bch_fs {
        struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
 
        struct write_point      btree_write_point;
+       struct write_point      rebalance_write_point;
 
        struct write_point      write_points[WRITE_POINT_COUNT];
        struct hlist_head       write_points_hash[WRITE_POINT_COUNT];
@@ -717,8 +709,8 @@ struct bch_fs {
 
        unsigned                btree_gc_periodic:1;
        unsigned                copy_gc_enabled:1;
-       unsigned                tiering_enabled:1;
-       unsigned                tiering_percent;
+       unsigned                rebalance_enabled:1;
+       unsigned                rebalance_percent;
 
 #define BCH_DEBUG_PARAM(name, description) bool name;
        BCH_DEBUG_PARAMS_ALL()
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 5e406275d5f688defcc9733af3ac44825ae22b45..0f2c9cecda724234121e6024eb23fd03e723f5e5 100644
@@ -608,12 +608,22 @@ BKEY_VAL_TYPE(inode_generation,   BCH_INODE_GENERATION);
        BCH_INODE_FIELD(bi_dev,                         32)     \
        BCH_INODE_FIELD(bi_data_checksum,               8)      \
        BCH_INODE_FIELD(bi_compression,                 8)      \
-       BCH_INODE_FIELD(bi_project,                     32)
+       BCH_INODE_FIELD(bi_project,                     32)     \
+       BCH_INODE_FIELD(bi_background_compression,      8)      \
+       BCH_INODE_FIELD(bi_data_replicas,               8)      \
+       BCH_INODE_FIELD(bi_promote_target,              16)     \
+       BCH_INODE_FIELD(bi_foreground_target,           16)     \
+       BCH_INODE_FIELD(bi_background_target,           16)
 
 #define BCH_INODE_FIELDS_INHERIT()                             \
        BCH_INODE_FIELD(bi_data_checksum)                       \
        BCH_INODE_FIELD(bi_compression)                         \
-       BCH_INODE_FIELD(bi_project)
+       BCH_INODE_FIELD(bi_project)                             \
+       BCH_INODE_FIELD(bi_background_compression)              \
+       BCH_INODE_FIELD(bi_data_replicas)                       \
+       BCH_INODE_FIELD(bi_promote_target)                      \
+       BCH_INODE_FIELD(bi_foreground_target)                   \
+       BCH_INODE_FIELD(bi_background_target)
 
 enum {
        /*
@@ -814,13 +824,14 @@ struct bch_member {
 };
 
 LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
-LE64_BITMASK(BCH_MEMBER_TIER,          struct bch_member, flags[0],  4,  8)
-/* 8-10 unused, was HAS_(META)DATA */
+/* 4-10 unused, was TIER, HAS_(META)DATA */
 LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
 LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15)
 LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
 LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
 
+#define BCH_TIER_MAX                   4U
+
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
 LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
@@ -834,8 +845,6 @@ enum bch_member_state {
        BCH_MEMBER_STATE_NR             = 4,
 };
 
-#define BCH_TIER_MAX                   4U
-
 enum cache_replacement {
        CACHE_REPLACEMENT_LRU           = 0,
        CACHE_REPLACEMENT_FIFO          = 1,
@@ -1077,6 +1086,12 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
 
 LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
 LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
+                                       struct bch_sb, flags[1], 28, 32);
+
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET,    struct bch_sb, flags[1], 28, 40);
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
 
 /* Features: */
 enum bch_sb_features {
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 0e0156d9016c733376082afaeabd463dfe6142e8..f42239dab71cd00d11d8250ab5c839bdb119892d 100644
@@ -348,7 +348,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c, NULL,
+       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
                                      writepoint_ptr(&c->btree_write_point),
                                      &devs_have,
                                      res->nr_replicas,
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 5ff90cc0015fca9e13afbfde0dd6dd649ebac4b6..ab6dc665186e7bc42c200e6b66e24584d47bf769 100644
@@ -40,27 +40,15 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
                if (!ca)
                        return ERR_PTR(-EINVAL);
        } else {
-               struct block_device *bdev;
                char *path;
-               unsigned i;
 
                path = strndup_user((const char __user *)
                                    (unsigned long) dev, PATH_MAX);
                if (IS_ERR(path))
                        return ERR_CAST(path);
 
-               bdev = lookup_bdev(path);
+               ca = bch2_dev_lookup(c, path);
                kfree(path);
-               if (IS_ERR(bdev))
-                       return ERR_CAST(bdev);
-
-               for_each_member_device(ca, c, i)
-                       if (ca->disk_sb.bdev == bdev)
-                               goto found;
-
-               ca = ERR_PTR(-ENOENT);
-found:
-               bdput(bdev);
        }
 
        return ca;
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 7726cfd8cfacaa06889281a7e2193c5422dc0ecf..18c945985636282e85cb2c68e89dfb07f6167af3 100644
@@ -360,6 +360,9 @@ static unsigned __bio_compress(struct bch_fs *c,
        unsigned pad;
        int ret = 0;
 
+       BUG_ON(compression_type >= BCH_COMPRESSION_NR);
+       BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
        /* If it's only one block, don't bother trying to compress: */
        if (bio_sectors(src) <= c->opts.block_size)
                return 0;
@@ -465,6 +468,8 @@ unsigned bch2_bio_compress(struct bch_fs *c,
        return compression_type;
 }
 
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
+
 #define BCH_FEATURE_NONE       0
 
 static const unsigned bch2_compression_opt_to_feature[] = {
@@ -475,29 +480,42 @@ static const unsigned bch2_compression_opt_to_feature[] = {
 
 #undef BCH_FEATURE_NONE
 
-/* doesn't write superblock: */
-int bch2_check_set_has_compressed_data(struct bch_fs *c,
-                                     unsigned compression_type)
+int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
 {
-       unsigned f;
        int ret = 0;
 
-       pr_verbose_init(c->opts, "");
+       if ((c->sb.features & f) == f)
+               return 0;
 
-       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+       mutex_lock(&c->sb_lock);
 
-       if (!compression_type)
-               goto out;
+       if ((c->sb.features & f) == f) {
+               mutex_unlock(&c->sb_lock);
+               return 0;
+       }
 
-       f = bch2_compression_opt_to_feature[compression_type];
-       if (bch2_sb_test_feature(c->disk_sb, f))
-               goto out;
+       ret = __bch2_fs_compress_init(c, c->sb.features|f);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ret;
+       }
 
-       bch2_sb_set_feature(c->disk_sb, f);
-       ret = bch2_fs_compress_init(c);
-out:
-       pr_verbose_init(c->opts, "ret %i", ret);
-       return ret;
+       c->disk_sb->features[0] |= cpu_to_le64(f);
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
+                                      unsigned compression_type)
+{
+       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+       return compression_type
+               ? __bch2_check_set_has_compressed_data(c,
+                               1ULL << bch2_compression_opt_to_feature[compression_type])
+               : 0;
 }
 
 void bch2_fs_compress_exit(struct bch_fs *c)
@@ -531,7 +549,7 @@ static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
                : 0;
 }
 
-int bch2_fs_compress_init(struct bch_fs *c)
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
        size_t max_extent = c->sb.encoded_extent_max << 9;
        size_t order = get_order(max_extent);
@@ -561,7 +579,7 @@ int bch2_fs_compress_init(struct bch_fs *c)
        for (i = compression_types;
             i < compression_types + ARRAY_SIZE(compression_types);
             i++)
-               if (bch2_sb_test_feature(c->disk_sb, i->feature))
+               if (features & (1 << i->feature))
                        goto have_compressed;
 
        goto out;
@@ -587,7 +605,7 @@ have_compressed:
                decompress_workspace_size =
                        max(decompress_workspace_size, i->decompress_workspace);
 
-               if (!bch2_sb_test_feature(c->disk_sb, i->feature))
+               if (!(features & (1 << i->feature)))
                        continue;
 
                if (i->decompress_workspace)
@@ -609,3 +627,17 @@ out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
 }
+
+int bch2_fs_compress_init(struct bch_fs *c)
+{
+       u64 f = c->sb.features;
+
+       if (c->opts.compression)
+               f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
+
+       if (c->opts.background_compression)
+               f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
+
+       return __bch2_fs_compress_init(c, f);
+
+}
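
The compress.c changes let compression features be enabled lazily at run time: __bch2_check_set_has_compressed_data() checks the cached feature bits, re-checks under sb_lock, initializes workspaces for the proposed feature set, and only then persists the new bit via bch2_write_super(). Here is a minimal standalone pthread sketch of that check, lock, re-check, initialize, publish sequence; the names are invented and this is not the bcachefs code.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t features;	/* cached copy of the on-disk feature bits */

static int init_workspaces(uint64_t new_features)
{
	/* stand-in for allocating compression workspaces */
	printf("initializing for features %#llx\n",
	       (unsigned long long) new_features);
	return 0;
}

static int check_set_feature(uint64_t f)
{
	int ret;

	/* opportunistic unlocked check, mirroring the original fast path */
	if ((features & f) == f)
		return 0;

	pthread_mutex_lock(&sb_lock);
	if ((features & f) == f) {	/* lost a race; someone else did it */
		pthread_mutex_unlock(&sb_lock);
		return 0;
	}

	ret = init_workspaces(features | f);
	if (!ret)
		features |= f;		/* publish only after init succeeded */
	pthread_mutex_unlock(&sb_lock);
	return ret;
}

int main(void)
{
	return check_set_feature(1ULL << 3);
}
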
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index ce1f8ba230356b532f273074e95bb215bf823cb9..37470f86e588f15d20410a66645109cff1560352 100644
@@ -1766,7 +1766,6 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
        unsigned seq, stale;
        char buf[160];
        bool bad;
-       unsigned ptrs_per_tier[BCH_TIER_MAX];
        unsigned replicas = 0;
 
        /*
@@ -1778,12 +1777,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
         * going to get overwritten during replay)
         */
 
-       memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
-
        extent_for_each_ptr(e, ptr) {
                ca = bch_dev_bkey_exists(c, ptr->dev);
                replicas++;
-               ptrs_per_tier[ca->mi.tier]++;
 
                /*
                 * If journal replay hasn't finished, we might be seeing keys
@@ -1886,12 +1882,6 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
 #undef p
 }
 
-static unsigned PTR_TIER(struct bch_fs *c,
-                        const struct bch_extent_ptr *ptr)
-{
-       return bch_dev_bkey_exists(c, ptr->dev)->mi.tier;
-}
-
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
                                 struct bch_extent_crc_unpacked new)
 {
@@ -2014,45 +2004,31 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                                      struct bkey_s_extent e,
-                                     unsigned nr_desired_replicas)
+                                     unsigned nr_desired_replicas,
+                                     unsigned target)
 {
        struct bch_extent_ptr *ptr;
-       unsigned tier = 0, nr_cached = 0;
-       unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
-       bool have_higher_tier;
+       unsigned nr_cached = 0, nr_good = bch2_extent_nr_good_ptrs(c, e.c);
 
        if (nr_good <= nr_desired_replicas)
                return;
 
        nr_cached = nr_good - nr_desired_replicas;
 
-       do {
-               have_higher_tier = false;
-
-               extent_for_each_ptr(e, ptr) {
-                       if (!ptr->cached &&
-                           PTR_TIER(c, ptr) == tier) {
-                               ptr->cached = true;
-                               nr_cached--;
-                               if (!nr_cached)
-                                       return;
-                       }
-
-                       if (PTR_TIER(c, ptr) > tier)
-                               have_higher_tier = true;
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached &&
+                   !dev_in_target(c->devs[ptr->dev], target)) {
+                       ptr->cached = true;
+                       nr_cached--;
+                       if (!nr_cached)
+                               return;
                }
-
-               tier++;
-       } while (have_higher_tier);
 }
 
 /*
- * This picks a non-stale pointer, preferabbly from a device other than
- * avoid.  Avoid can be NULL, meaning pick any.  If there are no non-stale
- * pointers to other devices, it will still pick a pointer from avoid.
- * Note that it prefers lowered-numbered pointers to higher-numbered pointers
- * as the pointers are sorted by tier, hence preferring pointers to tier 0
- * rather than pointers to tier 1.
+ * This picks a non-stale pointer, preferably from a device other than @avoid.
+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
+ * other devices, it will still pick a pointer from avoid.
  */
 void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
                          struct bch_devs_mask *avoid,
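
bch2_extent_mark_replicas_cached() now uses the target to decide which pointers to demote: when an extent has more good replicas than desired, pointers on devices outside the background target are the ones marked cached. A toy standalone version of that selection loop, with an invented bitmask encoding for the target, might look like this:

#include <stdbool.h>
#include <stdio.h>

struct ptr { unsigned dev; bool cached; };

static bool dev_in_target(unsigned dev, unsigned target)
{
	/* toy membership test: target is a bitmask of device indices */
	return target & (1U << dev);
}

static void mark_extra_cached(struct ptr *ptrs, unsigned nr,
			      unsigned nr_desired, unsigned target)
{
	unsigned i, nr_good = 0, nr_cached;

	for (i = 0; i < nr; i++)
		nr_good += !ptrs[i].cached;

	if (nr_good <= nr_desired)
		return;

	nr_cached = nr_good - nr_desired;

	/* demote copies that live outside the target first */
	for (i = 0; i < nr && nr_cached; i++)
		if (!ptrs[i].cached && !dev_in_target(ptrs[i].dev, target)) {
			ptrs[i].cached = true;
			nr_cached--;
		}
}

int main(void)
{
	struct ptr ptrs[] = { { 0, false }, { 1, false }, { 2, false } };
	unsigned i;

	/* want 1 durable copy; only device 0 is in the background target */
	mark_extra_cached(ptrs, 3, 1, 1U << 0);

	for (i = 0; i < 3; i++)
		printf("dev %u cached=%d\n", ptrs[i].dev, ptrs[i].cached);
	return 0;
}
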
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 75579273fae8ea159bf3eea7570a545752cb9050..83c0f24db58885335c4f8001d0b996468145ba2f 100644
@@ -39,7 +39,7 @@ bch2_insert_fixup_extent(struct btree_insert *,
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
-                                     unsigned);
+                                     unsigned, unsigned);
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 00475b99dff83a50ab221c737321c3a77dca61da..46cffc5c9b7a7f76528f1f25956590287bb67a27 100644
@@ -504,10 +504,8 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
        op->unalloc             = false;
        op->new_i_size          = U64_MAX;
 
-       bch2_write_op_init(&op->op, c);
-       op->op.csum_type        = bch2_data_checksum_type(c, opts.data_checksum);
-       op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
-       op->op.devs             = c->fastest_devs;
+       bch2_write_op_init(&op->op, c, opts);
+       op->op.target           = opts.foreground_target;
        op->op.index_update_fn  = bchfs_write_index_update;
        op_journal_seq_set(&op->op, &inode->ei_journal_seq);
 }
@@ -615,8 +613,14 @@ static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *in
                                     struct page *page, bool check_enospc)
 {
        struct bch_page_state *s = page_state(page), new, old;
+
+       /* XXX: this should not be open coded */
+       unsigned nr_replicas = inode->ei_inode.bi_data_replicas
+               ? inode->ei_inode.bi_data_replicas - 1
+               : c->opts.data_replicas;
+
        struct disk_reservation disk_res = bch2_disk_reservation_init(c,
-                                               READ_ONCE(c->opts.data_replicas));
+                                               nr_replicas);
        struct quota_res quota_res = { 0 };
        int ret = 0;
 
@@ -1894,7 +1898,7 @@ static int bch2_direct_IO_write(struct kiocb *req,
                goto err;
 
        ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
-                                       c->opts.data_replicas, 0);
+                                       dio->iop.op.opts.data_replicas, 0);
        if (unlikely(ret)) {
                if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
                                                      offset >> 9),
@@ -2351,7 +2355,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
        loff_t block_start, block_end;
        loff_t end = offset + len;
        unsigned sectors;
-       unsigned replicas = READ_ONCE(c->opts.data_replicas);
+       unsigned replicas = io_opts(c, inode).data_replicas;
        int ret;
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 80962b5d16a10348fad375f1de1abd1bb3da9358..c7e842ee84375082b4449fc0a53a0fd43d55518e 100644
@@ -1266,6 +1266,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
 {
        struct bch_fs *c = root->d_sb->s_fs_info;
        enum bch_opt_id i;
+       char buf[512];
 
        for (i = 0; i < bch2_opts_nr; i++) {
                const struct bch_option *opt = &bch2_opt_table[i];
@@ -1277,17 +1278,10 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
                        continue;
 
-               switch (opt->type) {
-               case BCH_OPT_BOOL:
-                       seq_printf(seq, ",%s%s", v ? "" : "no", opt->attr.name);
-                       break;
-               case BCH_OPT_UINT:
-                       seq_printf(seq, ",%s=%llu", opt->attr.name, v);
-                       break;
-               case BCH_OPT_STR:
-                       seq_printf(seq, ",%s=%s", opt->attr.name, opt->choices[v]);
-                       break;
-               }
+               bch2_opt_to_text(c, buf, sizeof(buf), opt, v,
+                                OPT_SHOW_MOUNT_STYLE);
+               seq_putc(seq, ',');
+               seq_puts(seq, buf);
        }
 
        return 0;
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 13495d487a686b5b4db16e632ea98a773adce5ef..6624d8af574f7d40e817974d852faa98bd09f928 100644
@@ -22,6 +22,7 @@
 #include "move.h"
 #include "super.h"
 #include "super-io.h"
+#include "tier.h"
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
@@ -220,9 +221,9 @@ int bch2_write_index_default(struct bch_write_op *op)
                             BTREE_ITER_INTENT);
 
        ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
-                                      NULL, op_journal_seq(op),
-                                      BTREE_INSERT_NOFAIL|
-                                      BTREE_INSERT_USE_RESERVE);
+                                       NULL, op_journal_seq(op),
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_USE_RESERVE);
        bch2_btree_iter_unlock(&iter);
 
        return ret;
@@ -238,7 +239,7 @@ static void bch2_write_index(struct closure *cl)
        struct keylist *keys = &op->insert_keys;
        struct bkey_s_extent e;
        struct bch_extent_ptr *ptr;
-       struct bkey_i *src, *dst = keys->keys, *n;
+       struct bkey_i *src, *dst = keys->keys, *n, *k;
        int ret;
 
        op->flags |= BCH_WRITE_LOOPED;
@@ -268,6 +269,14 @@ static void bch2_write_index(struct closure *cl)
 
        keys->top = dst;
 
+       /*
+        * probably not the ideal place to hook this in, but I don't
+        * particularly want to plumb io_opts all the way through the btree
+        * update stack right now
+        */
+       for_each_keylist_key(keys, k)
+               bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+
        if (!bch2_keylist_empty(keys)) {
                u64 sectors_start = keylist_sectors(keys);
                int ret = op->index_update_fn(op);
@@ -735,7 +744,7 @@ static void __bch2_write(struct closure *cl)
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
                wp = bch2_alloc_sectors_start(c,
-                       op->devs,
+                       op->target,
                        op->write_point,
                        &op->devs_have,
                        op->nr_replicas,
@@ -935,29 +944,32 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
        memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
               sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
 
-       ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
-                                     writepoint_hashed((unsigned long) current),
-                                     rbio->opts,
-                                     DATA_PROMOTE,
-                                     (struct data_opts) { 0 },
-                                     k);
+       ret = bch2_migrate_write_init(c, &op->write,
+                       writepoint_hashed((unsigned long) current),
+                       rbio->opts,
+                       DATA_PROMOTE,
+                       (struct data_opts) {
+                               .target = rbio->opts.promote_target
+                       },
+                       k);
        BUG_ON(ret);
 
        return op;
 }
 
-/* only promote if we're not reading from the fastest tier: */
-static bool should_promote(struct bch_fs *c,
-                          struct extent_pick_ptr *pick, unsigned flags)
+static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
+                          unsigned flags, u16 target)
 {
+       if (!target)
+               return false;
+
        if (!(flags & BCH_READ_MAY_PROMOTE))
                return false;
 
        if (percpu_ref_is_dying(&c->writes))
                return false;
 
-       return c->fastest_tier &&
-               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+       return bch2_extent_has_target(c, e, target);
 }
 
 /* Read */
@@ -1323,7 +1335,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
                bounce = true;
        }
 
-       promote = should_promote(c, pick, flags);
+       promote = should_promote(c, e, flags, orig->opts.promote_target);
        /* could also set read_full */
        if (promote)
                bounce = true;
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 4208fd4385bf058dc63fb2eb6de2c12620254c72..bf0b17e1deb9626d14ef0c18a6868ec59be96804 100644
@@ -61,24 +61,25 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 
 int bch2_write_index_default(struct bch_write_op *);
 
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+                                     struct bch_io_opts opts)
 {
        op->c                   = c;
        op->io_wq               = index_update_wq(op);
        op->flags               = 0;
        op->written             = 0;
        op->error               = 0;
-       op->csum_type           = bch2_data_checksum_type(c, c->opts.data_checksum);
-       op->compression_type    =
-               bch2_compression_opt_to_type[c->opts.compression];
+       op->csum_type           = bch2_data_checksum_type(c, opts.data_checksum);
+       op->compression_type    = bch2_compression_opt_to_type[opts.compression];
        op->nr_replicas         = 0;
        op->nr_replicas_required = c->opts.data_replicas_required;
        op->alloc_reserve       = RESERVE_NONE;
        op->open_buckets_nr     = 0;
        op->devs_have.nr        = 0;
+       op->target              = 0;
+       op->opts                = opts;
        op->pos                 = POS_MAX;
        op->version             = ZERO_VERSION;
-       op->devs                = NULL;
        op->write_point         = (struct write_point_specifier) { 0 };
        op->res                 = (struct disk_reservation) { 0 };
        op->journal_seq         = 0;
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index 32ecac24228829e05f5180bbcd89b29be7fc02b1..a022ab3354287c77509d15c3d2868910bead01da 100644
@@ -103,13 +103,14 @@ struct bch_write_op {
        u16                     target;
        u16                     nonce;
 
+       struct bch_io_opts      opts;
+
        struct bpos             pos;
        struct bversion         version;
 
        /* For BCH_WRITE_DATA_ENCODED: */
        struct bch_extent_crc_unpacked crc;
 
-       struct bch_devs_mask    *devs;
        struct write_point_specifier write_point;
 
        struct disk_reservation res;
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a176484ae91d00d8c01936fe1169a91fe3fbb5b7..a7c4c3ac1da5cc1fa83b642967f6814dc767c849 100644
 
 #include <trace/events/bcachefs.h>
 
+#define SECTORS_IN_FLIGHT_PER_DEVICE   2048
+
 struct moving_io {
        struct list_head        list;
        struct closure          cl;
        bool                    read_completed;
-       unsigned                sectors;
+
+       unsigned                read_dev;
+       unsigned                read_sectors;
+       unsigned                write_sectors;
 
        struct bch_read_bio     rbio;
 
@@ -34,7 +39,11 @@ struct moving_context {
        struct bch_move_stats   *stats;
 
        struct list_head        reads;
-       atomic_t                sectors_in_flight;
+
+       /* in flight sectors: */
+       atomic_t                read_sectors[BCH_SB_MEMBERS_MAX];
+       atomic_t                write_sectors;
+
        wait_queue_head_t       wait;
 };
 
@@ -116,7 +125,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                (struct bch_extent_crc_unpacked) { 0 });
                bch2_extent_normalize(c, extent_i_to_s(insert).s);
                bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
-                                                c->opts.data_replicas);
+                                                op->opts.data_replicas,
+                                                op->opts.background_target);
 
                /*
                 * It's possible we race, and for whatever reason the extent now
@@ -206,7 +216,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
 }
 
 int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
-                           struct bch_devs_mask *devs,
                            struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
                            enum data_cmd data_cmd,
@@ -219,11 +228,11 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
        m->data_opts    = data_opts;
        m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
 
-       bch2_write_op_init(&m->op, c);
-       m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+       bch2_write_op_init(&m->op, c, io_opts);
        m->op.compression_type =
-               bch2_compression_opt_to_type[io_opts.compression];
-       m->op.devs      = devs;
+               bch2_compression_opt_to_type[io_opts.background_compression ?:
+                                            io_opts.compression];
+       m->op.target    = data_opts.target;
        m->op.write_point = wp;
 
        if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
@@ -241,8 +250,8 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 
        switch (data_cmd) {
        case DATA_ADD_REPLICAS:
-               if (m->nr_ptrs_reserved < c->opts.data_replicas) {
-                       m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+               if (m->nr_ptrs_reserved < io_opts.data_replicas) {
+                       m->op.nr_replicas = io_opts.data_replicas - m->nr_ptrs_reserved;
 
                        ret = bch2_disk_reservation_get(c, &m->op.res,
                                                        k.k->size,
@@ -250,7 +259,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                        if (ret)
                                return ret;
 
-                       m->nr_ptrs_reserved = c->opts.data_replicas;
+                       m->nr_ptrs_reserved = io_opts.data_replicas;
                }
                break;
        case DATA_REWRITE:
@@ -279,19 +288,29 @@ static void move_free(struct closure *cl)
                if (bv->bv_page)
                        __free_page(bv->bv_page);
 
-       atomic_sub(io->sectors, &ctxt->sectors_in_flight);
        wake_up(&ctxt->wait);
 
        kfree(io);
 }
 
+static void move_write_done(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, cl);
+
+       atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
+       closure_return_with_destructor(cl, move_free);
+}
+
 static void move_write(struct closure *cl)
 {
        struct moving_io *io = container_of(cl, struct moving_io, cl);
 
        if (likely(!io->rbio.bio.bi_status)) {
                bch2_migrate_read_done(&io->write, &io->rbio);
+
+               atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
                closure_call(&io->write.op.cl, bch2_write, NULL, cl);
+               continue_at(cl, move_write_done, NULL);
        }
 
        closure_return_with_destructor(cl, move_free);
@@ -310,16 +329,46 @@ static void move_read_endio(struct bio *bio)
        struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
        struct moving_context *ctxt = io->write.ctxt;
 
+       atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
        io->read_completed = true;
+
        if (next_pending_write(ctxt))
                wake_up(&ctxt->wait);
 
        closure_put(&ctxt->cl);
 }
 
+static void do_pending_writes(struct moving_context *ctxt)
+{
+       struct moving_io *io;
+
+       while ((io = next_pending_write(ctxt))) {
+               list_del(&io->list);
+               closure_call(&io->cl, move_write, NULL, &ctxt->cl);
+       }
+}
+
+#define move_ctxt_wait_event(_ctxt, _cond)                     \
+do {                                                           \
+       do_pending_writes(_ctxt);                               \
+                                                               \
+       if (_cond)                                              \
+               break;                                          \
+       __wait_event((_ctxt)->wait,                             \
+                    next_pending_write(_ctxt) || (_cond));     \
+} while (1)
+
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+{
+       unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
+
+       move_ctxt_wait_event(ctxt,
+               !atomic_read(&ctxt->write_sectors) ||
+               atomic_read(&ctxt->write_sectors) != sectors_pending);
+}
+
 static int bch2_move_extent(struct bch_fs *c,
                            struct moving_context *ctxt,
-                           struct bch_devs_mask *devs,
                            struct write_point_specifier wp,
                            struct bch_io_opts io_opts,
                            struct bkey_s_c_extent e,
@@ -333,10 +382,18 @@ static int bch2_move_extent(struct bch_fs *c,
        unsigned sectors = e.k->size, pages;
        int ret = -ENOMEM;
 
+       move_ctxt_wait_event(ctxt,
+               atomic_read(&ctxt->write_sectors) <
+               SECTORS_IN_FLIGHT_PER_DEVICE);
+
        bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
        if (IS_ERR_OR_NULL(pick.ca))
                return pick.ca ? PTR_ERR(pick.ca) : 0;
 
+       move_ctxt_wait_event(ctxt,
+               atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
+               SECTORS_IN_FLIGHT_PER_DEVICE);
+
        /* write path might have to decompress data: */
        extent_for_each_ptr_crc(e, ptr, crc)
                sectors = max_t(unsigned, sectors, crc.uncompressed_size);
@@ -347,8 +404,10 @@ static int bch2_move_extent(struct bch_fs *c,
        if (!io)
                goto err;
 
-       io->write.ctxt  = ctxt;
-       io->sectors     = e.k->size;
+       io->write.ctxt          = ctxt;
+       io->read_dev            = pick.ca->dev_idx;
+       io->read_sectors        = pick.crc.uncompressed_size;
+       io->write_sectors       = e.k->size;
 
        bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->write.op.wbio.bio,
@@ -368,8 +427,8 @@ static int bch2_move_extent(struct bch_fs *c,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       ret = bch2_migrate_write_init(c, &io->write, devs, wp,
-                                     io_opts, data_cmd, data_opts, e.s_c);
+       ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
+                                     data_cmd, data_opts, e.s_c);
        if (ret)
                goto err_free_pages;
 
@@ -378,7 +437,7 @@ static int bch2_move_extent(struct bch_fs *c,
 
        trace_move_extent(e.k);
 
-       atomic_add(io->sectors, &ctxt->sectors_in_flight);
+       atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
        list_add_tail(&io->list, &ctxt->reads);
 
        /*
@@ -398,39 +457,8 @@ err:
        return ret;
 }
 
-static void do_pending_writes(struct moving_context *ctxt)
-{
-       struct moving_io *io;
-
-       while ((io = next_pending_write(ctxt))) {
-               list_del(&io->list);
-               closure_call(&io->cl, move_write, NULL, &ctxt->cl);
-       }
-}
-
-#define move_ctxt_wait_event(_ctxt, _cond)                     \
-do {                                                           \
-       do_pending_writes(_ctxt);                               \
-                                                               \
-       if (_cond)                                              \
-               break;                                          \
-       __wait_event((_ctxt)->wait,                             \
-                    next_pending_write(_ctxt) || (_cond));     \
-} while (1)
-
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
-{
-       unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight);
-
-       move_ctxt_wait_event(ctxt,
-               !atomic_read(&ctxt->sectors_in_flight) ||
-               atomic_read(&ctxt->sectors_in_flight) != sectors_pending);
-}
-
 int bch2_move_data(struct bch_fs *c,
                   struct bch_ratelimit *rate,
-                  unsigned sectors_in_flight,
-                  struct bch_devs_mask *devs,
                   struct write_point_specifier wp,
                   struct bpos start,
                   struct bpos end,
@@ -460,13 +488,6 @@ int bch2_move_data(struct bch_fs *c,
                bch2_ratelimit_reset(rate);
 
        while (!kthread || !(ret = kthread_should_stop())) {
-               if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) {
-                       bch2_btree_iter_unlock(&stats->iter);
-                       move_ctxt_wait_event(&ctxt,
-                                            atomic_read(&ctxt.sectors_in_flight) <
-                                            sectors_in_flight);
-               }
-
                if (rate &&
                    bch2_ratelimit_delay(rate) &&
                    (bch2_btree_iter_unlock(&stats->iter),
@@ -519,7 +540,7 @@ peek:
                k = bkey_i_to_s_c(&tmp.k);
                bch2_btree_iter_unlock(&stats->iter);
 
-               ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+               ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
                                        bkey_s_c_to_extent(k),
                                        data_cmd, data_opts);
                if (ret2) {
@@ -545,11 +566,10 @@ next_nondata:
 
        bch2_btree_iter_unlock(&stats->iter);
 
-       move_ctxt_wait_event(&ctxt, !atomic_read(&ctxt.sectors_in_flight));
+       move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
        closure_sync(&ctxt.cl);
 
-       EBUG_ON(!list_empty(&ctxt.reads));
-       EBUG_ON(atomic_read(&ctxt.sectors_in_flight));
+       EBUG_ON(atomic_read(&ctxt.write_sectors));
 
        trace_move_data(c,
                        atomic64_read(&stats->sectors_moved),
@@ -671,11 +691,12 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
        unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
        unsigned replicas = type == BKEY_TYPE_BTREE
                ? c->opts.metadata_replicas
-               : c->opts.data_replicas;
+               : io_opts->data_replicas;
 
        if (!nr_good || nr_good >= replicas)
                return DATA_SKIP;
 
+       data_opts->target               = 0;
        data_opts->btree_insert_flags = 0;
        return DATA_ADD_REPLICAS;
 }
@@ -691,6 +712,7 @@ static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
        if (!bch2_extent_has_device(e, op->migrate.dev))
                return DATA_SKIP;
 
+       data_opts->target               = 0;
        data_opts->btree_insert_flags   = 0;
        data_opts->rewrite_dev          = op->migrate.dev;
        return DATA_REWRITE;
@@ -710,8 +732,7 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
                ret = bch2_gc_btree_replicas(c) ?: ret;
 
-               ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
-                                    NULL,
+               ret = bch2_move_data(c, NULL,
                                     writepoint_hashed((unsigned long) current),
                                     op.start,
                                     op.end,
@@ -728,8 +749,7 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
                ret = bch2_gc_btree_replicas(c) ?: ret;
 
-               ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
-                                    NULL,
+               ret = bch2_move_data(c, NULL,
                                     writepoint_hashed((unsigned long) current),
                                     op.start,
                                     op.end,
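
move.c now throttles reads per device and writes globally by sectors in flight, and move_ctxt_wait_event() (moved up next to its helpers) always drains completed reads, issuing their writes, before it sleeps. The rough pthread analogue below is illustrative only, not the kernel macro: the point is that pending work is kicked off before every wait, since completing that work is what eventually makes the condition true.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t progress = PTHREAD_COND_INITIALIZER;
static int write_sectors_in_flight;
static int pending_writes;

/* stand-in: start the writes for all reads that already completed */
static void issue_pending_writes(void)
{
	while (pending_writes) {
		pending_writes--;
		write_sectors_in_flight++;	/* a completion would decrement */
	}
}

static void wait_until_under(int limit)
{
	pthread_mutex_lock(&lock);
	for (;;) {
		issue_pending_writes();		/* drain first, as the macro does */
		if (write_sectors_in_flight < limit)
			break;
		/* sleep until a read or write completion signals progress */
		pthread_cond_wait(&progress, &lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	wait_until_under(2048);
	printf("sectors in flight: %d\n", write_sectors_in_flight);
	return 0;
}
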
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 819e5d9f0a24234b800c144fa43445898be3501b..bc98f94bb23d8b267974f864b6a80d64411e2c73 100644
@@ -17,6 +17,7 @@ enum data_cmd {
 };
 
 struct data_opts {
+       u16             target;
        unsigned        rewrite_dev;
        int             btree_insert_flags;
 };
@@ -38,14 +39,11 @@ struct migrate_write {
 
 void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
 int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
-                           struct bch_devs_mask *,
                            struct write_point_specifier,
                            struct bch_io_opts,
                            enum data_cmd, struct data_opts,
                            struct bkey_s_c);
 
-#define SECTORS_IN_FLIGHT_PER_DEVICE   2048
-
 typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
                                enum bkey_type, struct bkey_s_c_extent,
                                struct bch_io_opts *, struct data_opts *);
@@ -61,7 +59,6 @@ struct bch_move_stats {
 };
 
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
-                  unsigned, struct bch_devs_mask *,
                   struct write_point_specifier,
                   struct bpos, struct bpos,
                   move_pred_fn, void *,
index c306a89f8401f160dd3204c75cdc57fb17a730da..ad56e039163b07c289a98c6180d5c3c472c5cf68 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
+#include <linux/sched/task.h>
 #include <linux/sort.h>
 #include <linux/wait.h>
 
@@ -94,7 +95,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
        if (!__copygc_pred(ca, e))
                return DATA_SKIP;
 
-       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE,
+       data_opts->target               = dev_to_target(ca->dev_idx);
+       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
        data_opts->rewrite_dev          = ca->dev_idx;
        return DATA_REWRITE;
 }
@@ -178,8 +180,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                        bucket_offset_cmp, NULL);
 
        ret = bch2_move_data(c, &ca->copygc_pd.rate,
-                            SECTORS_IN_FLIGHT_PER_DEVICE,
-                            &ca->self,
                             writepoint_ptr(&ca->copygc_write_point),
                             POS_MIN, POS_MAX,
                             copygc_pred, ca,
@@ -248,8 +248,10 @@ void bch2_copygc_stop(struct bch_dev *ca)
        ca->copygc_pd.rate.rate = UINT_MAX;
        bch2_ratelimit_reset(&ca->copygc_pd.rate);
 
-       if (ca->copygc_thread)
+       if (ca->copygc_thread) {
                kthread_stop(ca->copygc_thread);
+               put_task_struct(ca->copygc_thread);
+       }
        ca->copygc_thread = NULL;
 }
 
@@ -269,6 +271,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
        if (IS_ERR(t))
                return PTR_ERR(t);
 
+       get_task_struct(t);
+
        ca->copygc_thread = t;
        wake_up_process(ca->copygc_thread);
 
index ec50345fda622dabdafe6bf7f550baeff995ab52..326b8ad9caf20d5dce19ed60a8307c538aa99a49 100644 (file)
@@ -1,7 +1,9 @@
 
 #include <linux/kernel.h>
 
+#include "bcachefs.h"
 #include "opts.h"
+#include "super-io.h"
 #include "util.h"
 
 const char * const bch2_error_actions[] = {
@@ -139,6 +141,9 @@ const struct bch_option bch2_opt_table[] = {
 #define OPT_BOOL()             .type = BCH_OPT_BOOL
 #define OPT_UINT(_min, _max)   .type = BCH_OPT_UINT, .min = _min, .max = _max
 #define OPT_STR(_choices)      .type = BCH_OPT_STR, .choices = _choices
+#define OPT_FN(_fn)            .type = BCH_OPT_FN,                     \
+                               .parse = _fn##_parse,                   \
+                               .print = _fn##_print
 
 #define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default)         \
        [Opt_##_name] = {                                               \
@@ -189,7 +194,8 @@ static int bch2_mount_opt_lookup(const char *name)
        return bch2_opt_lookup(name);
 }
 
-int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res)
+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
+                  const char *val, u64 *res)
 {
        ssize_t ret;
 
@@ -217,11 +223,50 @@ int bch2_opt_parse(const struct bch_option *opt, const char *val, u64 *res)
 
                *res = ret;
                break;
+       case BCH_OPT_FN:
+               if (!c)
+                       return -EINVAL;
+
+               return opt->parse(c, val, res);
        }
 
        return 0;
 }
 
+int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
+                    const struct bch_option *opt, u64 v,
+                    unsigned flags)
+{
+       char *out = buf, *end = buf + len;
+
+       if (flags & OPT_SHOW_MOUNT_STYLE) {
+               if (opt->type == BCH_OPT_BOOL)
+                       return scnprintf(out, end - out, "%s%s",
+                                        v ? "" : "no",
+                                        opt->attr.name);
+
+               out += scnprintf(out, end - out, "%s=", opt->attr.name);
+       }
+
+       switch (opt->type) {
+       case BCH_OPT_BOOL:
+       case BCH_OPT_UINT:
+               out += scnprintf(out, end - out, "%lli", v);
+               break;
+       case BCH_OPT_STR:
+               out += (flags & OPT_SHOW_FULL_LIST)
+                       ? bch2_scnprint_string_list(out, end - out, opt->choices, v)
+                       : scnprintf(out, end - out, opt->choices[v]);
+               break;
+       case BCH_OPT_FN:
+               return opt->print(c, out, end - out, v);
+       default:
+               BUG();
+       }
+
+       return out - buf;
+}
+
 int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
 {
        char *opt, *name, *val;
@@ -237,7 +282,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
                        if (id < 0)
                                goto bad_opt;
 
-                       ret = bch2_opt_parse(&bch2_opt_table[id], val, &v);
+                       ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
                        if (ret < 0)
                                goto bad_val;
                } else {
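bch2_opt_parse() now takes the filesystem because BCH_OPT_FN options (the io targets) need to look up devices and disk groups, and bch2_opt_to_text() becomes the one formatter shared by sysfs, mount options and xattrs. A minimal round-trip sketch, assuming the bcachefs headers; the option and buffer size are arbitrary:

static int example_opt_roundtrip(struct bch_fs *c, const char *val)
{
        const struct bch_option *opt = &bch2_opt_table[Opt_foreground_target];
        char buf[64];
        u64 v;
        int ret;

        /* c may only be NULL for non-FN options (as in mount option parsing): */
        ret = bch2_opt_parse(c, opt, val, &v);
        if (ret < 0)
                return ret;

        /* prints "foreground_target=<value>" with OPT_SHOW_MOUNT_STYLE: */
        return bch2_opt_to_text(c, buf, sizeof(buf), opt, v, OPT_SHOW_MOUNT_STYLE);
}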
index 8a3ac66b948ce0f60b2a90f3a04ea1ffaa029075..e7ab8870d3ac2a35b60fc42ad0bd0b0278045d43 100644 (file)
@@ -42,6 +42,7 @@ enum opt_type {
        BCH_OPT_BOOL,
        BCH_OPT_UINT,
        BCH_OPT_STR,
+       BCH_OPT_FN,
 };
 
 /**
@@ -94,9 +95,21 @@ enum opt_type {
        BCH_OPT(compression,            u8,     OPT_RUNTIME,            \
                OPT_STR(bch2_compression_types),                        \
                BCH_SB_COMPRESSION_TYPE,        BCH_COMPRESSION_OPT_NONE)\
+       BCH_OPT(background_compression, u8,     OPT_RUNTIME,            \
+               OPT_STR(bch2_compression_types),                        \
+               BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
        BCH_OPT(str_hash,               u8,     OPT_RUNTIME,            \
                OPT_STR(bch2_str_hash_types),                           \
                BCH_SB_STR_HASH_TYPE,           BCH_STR_HASH_SIPHASH)   \
+       BCH_OPT(foreground_target,      u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_FOREGROUND_TARGET,       0)                      \
+       BCH_OPT(background_target,      u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_BACKGROUND_TARGET,       0)                      \
+       BCH_OPT(promote_target,         u16,    OPT_RUNTIME,            \
+               OPT_FN(bch2_opt_target),                                \
+               BCH_SB_PROMOTE_TARGET,  0)                              \
        BCH_OPT(inodes_32bit,           u8,     OPT_RUNTIME,            \
                OPT_BOOL(),                                             \
                BCH_SB_INODE_32BIT,             false)                  \
@@ -205,6 +218,8 @@ enum bch_opt_id {
        bch2_opts_nr
 };
 
+struct bch_fs;
+
 struct bch_option {
        struct attribute        attr;
        void                    (*set_sb)(struct bch_sb *, u64);
@@ -218,6 +233,10 @@ struct bch_option {
        struct {
                const char * const *choices;
        };
+       struct {
+               int (*parse)(struct bch_fs *, const char *, u64 *);
+               int (*print)(struct bch_fs *, char *, size_t, u64);
+       };
        };
 
 };
@@ -231,14 +250,26 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
 struct bch_opts bch2_opts_from_sb(struct bch_sb *);
 
 int bch2_opt_lookup(const char *);
-int bch2_opt_parse(const struct bch_option *, const char *, u64 *);
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+
+#define OPT_SHOW_FULL_LIST     (1 << 0)
+#define OPT_SHOW_MOUNT_STYLE   (1 << 1)
+
+int bch2_opt_to_text(struct bch_fs *, char *, size_t,
+                    const struct bch_option *, u64, unsigned);
+
 int bch2_parse_mount_opts(struct bch_opts *, char *);
 
 /* inode opts: */
 
 #define BCH_INODE_OPTS()                                       \
        BCH_INODE_OPT(data_checksum,                    8)      \
-       BCH_INODE_OPT(compression,                      8)
+       BCH_INODE_OPT(compression,                      8)      \
+       BCH_INODE_OPT(background_compression,           8)      \
+       BCH_INODE_OPT(data_replicas,                    8)      \
+       BCH_INODE_OPT(promote_target,                   16)     \
+       BCH_INODE_OPT(foreground_target,                16)     \
+       BCH_INODE_OPT(background_target,                16)
 
 struct bch_io_opts {
 #define BCH_INODE_OPT(_name, _bits)    unsigned _name##_defined:1;
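An OPT_FN(_fn) option is wired up purely by naming convention: the table entry gets .type = BCH_OPT_FN, .parse = _fn##_parse and .print = _fn##_print. So a hypothetical OPT_FN(bch2_opt_example) entry would only require this pair, with the signatures taken from struct bch_option above (the helpers themselves are made up for illustration):

int bch2_opt_example_parse(struct bch_fs *c, const char *val, u64 *res)
{
        /* map the string to a u64 option value, or return an error: */
        return kstrtou64(val, 10, res);
}

int bch2_opt_example_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
        return scnprintf(buf, len, "%llu", v);
}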
index c747391707b3f5a1e210fb836ed11ce71f643e13..69101f3a68a5e8b1fd2574118c789498e3938d51 100644 (file)
@@ -400,6 +400,7 @@ static void bch2_sb_update(struct bch_fs *c)
        c->sb.time_base_lo      = le64_to_cpu(src->time_base_lo);
        c->sb.time_base_hi      = le32_to_cpu(src->time_base_hi);
        c->sb.time_precision    = le32_to_cpu(src->time_precision);
+       c->sb.features          = le64_to_cpu(src->features[0]);
 
        for_each_member_device(ca, c, i)
                ca->mi = bch2_mi_to_cpu(mi->members + i);
@@ -1600,24 +1601,22 @@ static const char *bch2_sb_validate_quota(struct bch_sb *sb,
 
 /* Disk groups: */
 
-#if 0
-static size_t trim_nulls(const char *str, size_t len)
+static int strcmp_void(const void *l, const void *r)
 {
-       while (len && !str[len - 1])
-               --len;
-       return len;
+       return strcmp(l, r);
 }
-#endif
 
 static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
                                                struct bch_sb_field *f)
 {
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
+       struct bch_disk_group *g;
        struct bch_sb_field_members *mi;
        struct bch_member *m;
-       struct bch_disk_group *g;
-       unsigned nr_groups;
+       unsigned i, nr_groups, nr_live = 0, len;
+       char **labels, *l;
+       const char *err = NULL;
 
        mi              = bch2_sb_get_members(sb);
        groups          = bch2_sb_get_disk_groups(sb);
@@ -1626,32 +1625,57 @@ static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
        for (m = mi->members;
             m < mi->members + sb->nr_devices;
             m++) {
+               unsigned g;
+
                if (!BCH_MEMBER_GROUP(m))
                        continue;
 
-               if (BCH_MEMBER_GROUP(m) >= nr_groups)
-                       return "disk has invalid group";
+               g = BCH_MEMBER_GROUP(m) - 1;
 
-               g = &groups->entries[BCH_MEMBER_GROUP(m)];
-               if (BCH_GROUP_DELETED(g))
+               if (g >= nr_groups ||
+                   BCH_GROUP_DELETED(&groups->entries[g]))
                        return "disk has invalid group";
        }
-#if 0
-       if (!groups)
+
+       if (!nr_groups)
                return NULL;
 
-       char **labels;
        labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
        if (!labels)
                return "cannot allocate memory";
 
-       for (g = groups->groups;
-            g < groups->groups + nr_groups;
+       for (g = groups->entries;
+            g < groups->entries + nr_groups;
             g++) {
+               if (BCH_GROUP_DELETED(g))
+                       continue;
+
+               len = strnlen(g->label, sizeof(g->label));
 
+               labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL);
+               if (!l) {
+                       err = "cannot allocate memory";
+                       goto err;
+               }
+
+               memcpy(l, g->label, len);
+               l[len] = '\0';
        }
-#endif
-       return NULL;
+
+       sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL);
+
+       for (i = 0; i + 1 < nr_live; i++)
+               if (!strcmp(labels[i], labels[i + 1])) {
+                       err = "duplicate group labels";
+                       goto err;
+               }
+
+       err = NULL;
+err:
+       for (i = 0; i < nr_live; i++)
+               kfree(labels[i]);
+       kfree(labels);
+       return err;
 }
 
 static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
@@ -1692,7 +1716,11 @@ static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
                if (!bch2_member_exists(m))
                        continue;
 
-               __set_bit(i, dst->devs.d);
+               dst = BCH_MEMBER_GROUP(m)
+                       ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1]
+                       : NULL;
+               if (dst)
+                       __set_bit(i, dst->devs.d);
        }
 
        old_g = c->disk_groups;
@@ -1708,18 +1736,140 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe
        struct target t = target_decode(target);
 
        switch (t.type) {
-       case TARGET_DEV:
-               BUG_ON(t.dev >= c->sb.nr_devices && !c->devs[t.dev]);
-               return &c->devs[t.dev]->self;
+       case TARGET_DEV: {
+               struct bch_dev *ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+               return ca ? &ca->self : NULL;
+       }
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+               return t.group < g->nr && !g->entries[t.group].deleted
+                       ? &g->entries[t.group].devs
+                       : NULL;
+       }
+       default:
+               BUG();
+       }
+}
+
+int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+                          const char *name)
+{
+       unsigned i, nr_groups = disk_groups_nr(groups);
+       unsigned len = strlen(name);
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *g = groups->entries + i;
+
+               if (BCH_GROUP_DELETED(g))
+                       continue;
+
+               if (strnlen(g->label, sizeof(g->label)) == len &&
+                   !memcmp(name, g->label, len))
+                       return i;
+       }
+
+       return -1;
+}
+
+static int bch2_disk_group_find(struct bch_fs *c, const char *name)
+{
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       ret = __bch2_disk_group_find(bch2_sb_get_disk_groups(c->disk_sb), name);
+       mutex_unlock(&c->sb_lock);
+
+       return ret;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+       struct bch_dev *ca;
+       int g;
+
+       if (!strlen(buf) || !strcmp(buf, "none")) {
+               *v = 0;
+               return 0;
+       }
+
+       /* Is it a device? */
+       ca = bch2_dev_lookup(c, buf);
+       if (!IS_ERR(ca)) {
+               *v = dev_to_target(ca->dev_idx);
+               percpu_ref_put(&ca->ref);
+               return 0;
+       }
+
+       g = bch2_disk_group_find(c, buf);
+       if (g >= 0) {
+               *v = group_to_target(g);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
+{
+       struct target t = target_decode(v);
+       int ret;
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return scnprintf(buf, len, "none");
+       case TARGET_DEV: {
+               struct bch_dev *ca;
+
+               rcu_read_lock();
+               ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+
+               if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                       char b[BDEVNAME_SIZE];
+
+                       ret = scnprintf(buf, len, "/dev/%s",
+                                       bdevname(ca->disk_sb.bdev, b));
+                       percpu_ref_put(&ca->io_ref);
+               } else if (ca) {
+                       ret = scnprintf(buf, len, "offline device %u", t.dev);
+               } else {
+                       ret = scnprintf(buf, len, "invalid device %u", t.dev);
+               }
+
+               rcu_read_unlock();
+               break;
+       }
        case TARGET_GROUP: {
-               struct bch_disk_groups_cpu *g =
-                       rcu_dereference(c->disk_groups);
+               struct bch_sb_field_disk_groups *groups;
+               struct bch_disk_group *g;
+
+               mutex_lock(&c->sb_lock);
+               groups = bch2_sb_get_disk_groups(c->disk_sb);
+
+               g = t.group < disk_groups_nr(groups)
+                       ? groups->entries + t.group
+                       : NULL;
+
+               if (g && !BCH_GROUP_DELETED(g)) {
+                       ret = len ? min(len - 1, strnlen(g->label, sizeof(g->label))) : 0;
 
-               /* XXX: what to do here? */
-               BUG_ON(t.group >= g->nr || g->entries[t.group].deleted);
-               return &g->entries[t.group].devs;
+                       memcpy(buf, g->label, ret);
+                       if (len)
+                               buf[ret] = '\0';
+               } else {
+                       ret = scnprintf(buf, len, "invalid group %u", t.group);
+               }
+
+               mutex_unlock(&c->sb_lock);
+               break;
        }
        default:
                BUG();
        }
+
+       return ret;
 }
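bch2_opt_target_parse() accepts three spellings and folds them into one small integer: an empty string or "none" means no target, a block device path names a member device, and anything else is tried as a disk group label. A minimal sketch of using it to set the in-memory option, assuming the bcachefs headers; it deliberately skips the superblock write that the real sysfs/mount paths also perform:

static int example_set_foreground_target(struct bch_fs *c, const char *name)
{
        u64 v;
        int ret;

        /* "" / "none" -> 0, "/dev/..." -> dev_to_target(), label -> group_to_target() */
        ret = bch2_opt_target_parse(c, name, &v);
        if (ret)
                return ret;

        bch2_opt_set_by_id(&c->opts, Opt_foreground_target, v);
        return 0;
}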
index d7fecf02f81cb019e9fa548be43337368fecfed2..3811de72c7a9b633178e9f75e89d546d69ee206c 100644 (file)
@@ -129,7 +129,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .bucket_size    = le16_to_cpu(mi->bucket_size),
                .group          = BCH_MEMBER_GROUP(mi),
                .state          = BCH_MEMBER_STATE(mi),
-               .tier           = BCH_MEMBER_TIER(mi),
                .replacement    = BCH_MEMBER_REPLACEMENT(mi),
                .discard        = BCH_MEMBER_DISCARD(mi),
                .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
@@ -204,27 +203,34 @@ struct target {
        };
 };
 
+#define TARGET_DEV_START       1
+#define TARGET_GROUP_START     (256 + TARGET_DEV_START)
+
 static inline u16 dev_to_target(unsigned dev)
 {
-       return 1 + dev;
+       return TARGET_DEV_START + dev;
 }
 
 static inline u16 group_to_target(unsigned group)
 {
-       return 1 + U8_MAX + group;
+       return TARGET_GROUP_START + group;
 }
 
 static inline struct target target_decode(unsigned target)
 {
-       if (!target)
-               return (struct target) { .type = TARGET_NULL };
-
-       --target;
-       if (target <= U8_MAX)
-               return (struct target) { .type = TARGET_DEV, .dev = target };
-
-       target -= U8_MAX;
-       return (struct target) { .type = TARGET_GROUP, .group = target };
+       if (target >= TARGET_GROUP_START)
+               return (struct target) {
+                       .type   = TARGET_GROUP,
+                       .group  = target - TARGET_GROUP_START
+               };
+
+       if (target >= TARGET_DEV_START)
+               return (struct target) {
+                       .type   = TARGET_DEV,
+                       .group  = target - TARGET_DEV_START
+               };
+
+       return (struct target) { .type = TARGET_NULL };
 }
 
 static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
@@ -232,6 +238,8 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
        struct target t = target_decode(target);
 
        switch (t.type) {
+       case TARGET_NULL:
+               return false;
        case TARGET_DEV:
                return ca->dev_idx == t.dev;
        case TARGET_GROUP:
@@ -243,4 +251,9 @@ static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
 
 const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
 
+int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
+int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
+
 #endif /* _BCACHEFS_SUPER_IO_H */
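Worked example of the encoding above: the option value is 0 for "no target", TARGET_DEV_START + dev (1..256) for a device index, and TARGET_GROUP_START + group (257 and up) for a disk group index. A tiny sketch of the round trip:

static void example_target_encoding(void)
{
        /* 0                         -> TARGET_NULL
         * dev_to_target(3)   == 4   -> TARGET_DEV,   device index 3
         * group_to_target(0) == 257 -> TARGET_GROUP, group index 0 */
        struct target t = target_decode(group_to_target(0));

        BUG_ON(t.type != TARGET_GROUP || t.group != 0);
}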
index 58bcd7d1ee062c3390ad9874672c575c4910ffd3..abb971286cdddbbaa463663008d12d9fa13f54d6 100644 (file)
@@ -149,6 +149,7 @@ int bch2_congested(void *data, int bdi_bits)
        unsigned i;
        int ret = 0;
 
+       rcu_read_lock();
        if (bdi_bits & (1 << WB_sync_congested)) {
                /* Reads - check all devices: */
                for_each_readable_member(ca, c, i) {
@@ -160,12 +161,11 @@ int bch2_congested(void *data, int bdi_bits)
                        }
                }
        } else {
-               /* Writes prefer fastest tier: */
-               struct bch_tier *tier = READ_ONCE(c->fastest_tier);
-               struct bch_devs_mask *devs =
-                       tier ? &tier->devs : &c->rw_devs[BCH_DATA_USER];
+               unsigned target = READ_ONCE(c->opts.foreground_target);
+               const struct bch_devs_mask *devs = target
+                       ? bch2_target_to_mask(c, target)
+                       : &c->rw_devs[BCH_DATA_USER];
 
-               rcu_read_lock();
                for_each_member_device_rcu(ca, c, i, devs) {
                        bdi = ca->disk_sb.bdev->bd_bdi;
 
@@ -174,8 +174,8 @@ int bch2_congested(void *data, int bdi_bits)
                                break;
                        }
                }
-               rcu_read_unlock();
        }
+       rcu_read_unlock();
 
        return ret;
 }
@@ -185,9 +185,9 @@ int bch2_congested(void *data, int bdi_bits)
 /*
  * For startup/shutdown of RW stuff, the dependencies are:
  *
- * - foreground writes depend on copygc and tiering (to free up space)
+ * - foreground writes depend on copygc and rebalance (to free up space)
  *
- * - copygc and tiering depend on mark and sweep gc (they actually probably
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
  *   don't because they either reserve ahead of time or don't block if
  *   allocations fail, but allocations can require mark and sweep gc to run
  *   because of generation number wraparound)
@@ -225,7 +225,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        struct bch_dev *ca;
        unsigned i;
 
-       bch2_tiering_stop(c);
+       bch2_rebalance_stop(c);
 
        for_each_member_device(ca, c, i)
                bch2_copygc_stop(ca);
@@ -385,8 +385,8 @@ const char *bch2_fs_read_write(struct bch_fs *c)
                        goto err;
                }
 
-       err = "error starting tiering thread";
-       if (bch2_tiering_start(c))
+       err = "error starting rebalance thread";
+       if (bch2_rebalance_start(c))
                goto err;
 
        schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
@@ -531,7 +531,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 #undef BCH_TIME_STAT
 
        bch2_fs_allocator_init(c);
-       bch2_fs_tiering_init(c);
+       bch2_fs_rebalance_init(c);
        bch2_fs_quota_init(c);
 
        INIT_LIST_HEAD(&c->list);
@@ -555,8 +555,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->writeback_pages_max = (256 << 10) / PAGE_SIZE;
 
        c->copy_gc_enabled = 1;
-       c->tiering_enabled = 1;
-       c->tiering_percent = 10;
+       c->rebalance_enabled = 1;
+       c->rebalance_percent = 10;
 
        c->journal.write_time   = &c->journal_write_time;
        c->journal.delay_time   = &c->journal_delay_time;
@@ -626,7 +626,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_btree_cache_init(c) ||
            bch2_fs_encryption_init(c) ||
            bch2_fs_compress_init(c) ||
-           bch2_check_set_has_compressed_data(c, c->opts.compression) ||
            bch2_fs_fsio_init(c))
                goto err;
 
@@ -1216,6 +1215,8 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
        if (ca->mi.state == BCH_MEMBER_STATE_RW)
                bch2_dev_allocator_add(c, ca);
 
+       rebalance_wakeup(c);
+
        percpu_ref_reinit(&ca->io_ref);
        return 0;
 }
@@ -1340,9 +1341,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
        if (bch2_copygc_start(c, ca))
                return "error starting copygc thread";
 
-       if (bch2_tiering_start(c))
-               return "error starting tiering thread";
-
        return NULL;
 }
 
@@ -1350,6 +1348,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
                         enum bch_member_state new_state, int flags)
 {
        struct bch_sb_field_members *mi;
+       int ret = 0;
 
        if (ca->mi.state == new_state)
                return 0;
@@ -1368,10 +1367,13 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       if (new_state == BCH_MEMBER_STATE_RW)
-               return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+       if (new_state == BCH_MEMBER_STATE_RW &&
+           __bch2_dev_read_write(c, ca))
+               ret = -ENOMEM;
 
-       return 0;
+       rebalance_wakeup(c);
+
+       return ret;
 }
 
 int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1700,6 +1702,95 @@ err:
        return ret;
 }
 
+/* return with ref on ca->ref: */
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+{
+
+       struct block_device *bdev = lookup_bdev(path);
+       struct bch_dev *ca;
+       unsigned i;
+
+       if (IS_ERR(bdev))
+               return ERR_CAST(bdev);
+
+       for_each_member_device(ca, c, i)
+               if (ca->disk_sb.bdev == bdev)
+                       goto found;
+
+       ca = ERR_PTR(-ENOENT);
+found:
+       bdput(bdev);
+       return ca;
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label)
+{
+       struct bch_sb_field_disk_groups *groups;
+       struct bch_disk_group *g;
+       struct bch_member *mi;
+       unsigned i, v, nr_groups;
+       int ret;
+
+       if (strlen(label) > BCH_SB_LABEL_SIZE)
+               return -EINVAL;
+
+       mutex_lock(&c->sb_lock);
+       groups          = bch2_sb_get_disk_groups(c->disk_sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       if (!strcmp(label, "none")) {
+               v = 0;
+               goto write_sb;
+       }
+
+       ret = __bch2_disk_group_find(groups, label);
+       if (ret >= 0) {
+               v = ret + 1;
+               goto write_sb;
+       }
+
+       /* not found - create a new disk group: */
+
+       for (i = 0;
+            i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+            i++)
+               ;
+
+       if (i == nr_groups) {
+               unsigned u64s =
+                       (sizeof(struct bch_sb_field_disk_groups) +
+                        sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+                       sizeof(u64);
+
+               groups = bch2_fs_sb_resize_disk_groups(c, u64s);
+               if (!groups) {
+                       mutex_unlock(&c->sb_lock);
+                       return -ENOSPC;
+               }
+
+               nr_groups = disk_groups_nr(groups);
+       }
+
+       BUG_ON(i >= nr_groups);
+
+       g = &groups->entries[i];
+       v = i + 1;
+
+       memcpy(g->label, label, strlen(label));
+       if (strlen(label) < sizeof(g->label))
+               g->label[strlen(label)] = '\0';
+       SET_BCH_GROUP_DELETED(g, 0);
+       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+write_sb:
+       mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+       SET_BCH_MEMBER_GROUP(mi, v);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
 /* Filesystem open: */
 
 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
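bch2_dev_lookup() resolves a path to a member device via lookup_bdev() and returns with a reference on ca->ref; bch2_dev_group_set() then stores the group as index + 1 in the member's GROUP field (0 still means "no group"), reusing a deleted slot or growing the disk_groups section when the label is new. A minimal sketch of the pair, with an example label:

static int example_label_device(struct bch_fs *c, const char *path)
{
        struct bch_dev *ca = bch2_dev_lookup(c, path);  /* returns with ref on ca->ref */
        int ret;

        if (IS_ERR(ca))
                return PTR_ERR(ca);

        ret = bch2_dev_group_set(c, ca, "ssd");         /* "none" clears the group */
        percpu_ref_put(&ca->ref);
        return ret;
}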
index 1718f5c103034acbe9f48222298d62ab2280c513..652a572ff329d73e478538bb891f743a861dcd9c 100644 (file)
@@ -194,6 +194,8 @@ int bch2_dev_add(struct bch_fs *, const char *);
 int bch2_dev_online(struct bch_fs *, const char *);
 int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
 
 bool bch2_fs_emergency_read_only(struct bch_fs *);
 void bch2_fs_read_only(struct bch_fs *);
index d76d917cb03986f2c892cf38d8efd6153c0dca76..3be05e9b08888032b7bf9455023cbbfc431de4b0 100644 (file)
@@ -24,7 +24,6 @@ struct bch_member_cpu {
        u16                     bucket_size;    /* sectors */
        u16                     group;
        u8                      state;
-       u8                      tier;
        u8                      replacement;
        u8                      discard;
        u8                      data_allowed;
index 2e958a8ef3817075eda22b8be1cdaccc8b9f005d..e42bc1dae336c8571e67917eccbd4b723ddafa44 100644 (file)
@@ -168,15 +168,14 @@ rw_attribute(writeback_pages_max);
 
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
+rw_attribute(group);
 
 rw_attribute(copy_gc_enabled);
 sysfs_pd_controller_attribute(copy_gc);
 
-rw_attribute(tier);
-rw_attribute(tiering_enabled);
-rw_attribute(tiering_percent);
-sysfs_pd_controller_attribute(tiering);
-
+rw_attribute(rebalance_enabled);
+rw_attribute(rebalance_percent);
+sysfs_pd_controller_attribute(rebalance);
 
 rw_attribute(pd_controllers_update_seconds);
 
@@ -332,10 +331,10 @@ SHOW(bch2_fs)
        sysfs_print(pd_controllers_update_seconds,
                    c->pd_controllers_update_seconds);
 
-       sysfs_printf(tiering_enabled,           "%i", c->tiering_enabled);
-       sysfs_print(tiering_percent,            c->tiering_percent);
+       sysfs_printf(rebalance_enabled,         "%i", c->rebalance_enabled);
+       sysfs_print(rebalance_percent,          c->rebalance_percent);
 
-       sysfs_pd_controller_show(tiering,       &c->tiers[1].pd); /* XXX */
+       sysfs_pd_controller_show(rebalance,     &c->rebalance_pd); /* XXX */
 
        sysfs_printf(meta_replicas_have, "%u",  bch2_replicas_online(c, true));
        sysfs_printf(data_replicas_have, "%u",  bch2_replicas_online(c, false));
@@ -397,19 +396,19 @@ STORE(__bch2_fs)
                return ret;
        }
 
-       if (attr == &sysfs_tiering_enabled) {
-               ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
+       if (attr == &sysfs_rebalance_enabled) {
+               ssize_t ret = strtoul_safe(buf, c->rebalance_enabled)
                        ?: (ssize_t) size;
 
-               bch2_tiering_start(c); /* issue wakeups */
+               rebalance_wakeup(c);
                return ret;
        }
 
        sysfs_strtoul(pd_controllers_update_seconds,
                      c->pd_controllers_update_seconds);
 
-       sysfs_strtoul(tiering_percent,          c->tiering_percent);
-       sysfs_pd_controller_store(tiering,      &c->tiers[1].pd); /* XXX */
+       sysfs_strtoul(rebalance_percent,        c->rebalance_percent);
+       sysfs_pd_controller_store(rebalance,    &c->rebalance_pd);
 
        /* Debugging: */
 
@@ -468,7 +467,7 @@ struct attribute *bch2_fs_files[] = {
 
        &sysfs_writeback_pages_max,
 
-       &sysfs_tiering_percent,
+       &sysfs_rebalance_percent,
 
        &sysfs_compression_stats,
        NULL
@@ -506,8 +505,8 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_prune_cache,
 
        &sysfs_copy_gc_enabled,
-       &sysfs_tiering_enabled,
-       sysfs_pd_controller_files(tiering),
+       &sysfs_rebalance_enabled,
+       sysfs_pd_controller_files(rebalance),
        &sysfs_internal_uuid,
 
 #define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -527,9 +526,7 @@ SHOW(bch2_fs_opts_dir)
        int id = opt - bch2_opt_table;
        u64 v = bch2_opt_get_by_id(&c->opts, id);
 
-       out += opt->type == BCH_OPT_STR
-               ? bch2_scnprint_string_list(out, end - out, opt->choices, v)
-               : scnprintf(out, end - out, "%lli", v);
+       out += bch2_opt_to_text(c, out, end - out, opt, v, OPT_SHOW_FULL_LIST);
        out += scnprintf(out, end - out, "\n");
 
        return out - buf;
@@ -542,13 +539,12 @@ STORE(bch2_fs_opts_dir)
        int ret, id = opt - bch2_opt_table;
        u64 v;
 
-       ret = bch2_opt_parse(opt, buf, &v);
+       ret = bch2_opt_parse(c, opt, buf, &v);
        if (ret < 0)
                return ret;
 
-       mutex_lock(&c->sb_lock);
-
-       if (id == Opt_compression) {
+       if (id == Opt_compression ||
+           id == Opt_background_compression) {
                int ret = bch2_check_set_has_compressed_data(c, v);
                if (ret) {
                        mutex_unlock(&c->sb_lock);
@@ -557,13 +553,19 @@ STORE(bch2_fs_opts_dir)
        }
 
        if (opt->set_sb != SET_NO_SB_OPT) {
+               mutex_lock(&c->sb_lock);
                opt->set_sb(c->disk_sb, v);
                bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
        }
 
        bch2_opt_set_by_id(&c->opts, id, v);
 
-       mutex_unlock(&c->sb_lock);
+       if ((id == Opt_background_target ||
+            id == Opt_background_compression) && v) {
+               bch2_rebalance_add_work(c, S64_MAX);
+               rebalance_wakeup(c);
+       }
 
        return size;
 }
@@ -809,6 +811,26 @@ SHOW(bch2_dev)
        sysfs_print(nbuckets,           ca->mi.nbuckets);
        sysfs_print(discard,            ca->mi.discard);
 
+       if (attr == &sysfs_group) {
+               struct bch_sb_field_disk_groups *groups;
+               struct bch_disk_group *g;
+               unsigned len;
+
+               if (!ca->mi.group)
+                       return scnprintf(out, end - out, "none\n");
+
+               mutex_lock(&c->sb_lock);
+               groups = bch2_sb_get_disk_groups(c->disk_sb);
+
+               g = &groups->entries[ca->mi.group - 1];
+               len = strnlen(g->label, sizeof(g->label));
+               memcpy(buf, g->label, len);
+               mutex_unlock(&c->sb_lock);
+
+               buf[len++] = '\n';
+               return len;
+       }
+
        if (attr == &sysfs_has_data) {
                out += bch2_scnprint_flag_list(out, end - out,
                                               bch2_data_types,
@@ -827,8 +849,6 @@ SHOW(bch2_dev)
                return out - buf;
        }
 
-       sysfs_print(tier,               ca->mi.tier);
-
        if (attr == &sysfs_state_rw) {
                out += bch2_scnprint_string_list(out, end - out,
                                                 bch2_dev_state,
@@ -892,29 +912,10 @@ STORE(bch2_dev)
                mutex_unlock(&c->sb_lock);
        }
 
-       if (attr == &sysfs_tier) {
-               unsigned prev_tier;
-               unsigned v = strtoul_restrict_or_return(buf,
-                                       0, BCH_TIER_MAX - 1);
-
-               mutex_lock(&c->sb_lock);
-               prev_tier = ca->mi.tier;
-
-               if (v == ca->mi.tier) {
-                       mutex_unlock(&c->sb_lock);
-                       return size;
-               }
-
-               mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
-               SET_BCH_MEMBER_TIER(mi, v);
-               bch2_write_super(c);
-
-               clear_bit(ca->dev_idx, c->tiers[prev_tier].devs.d);
-               set_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d);
-               mutex_unlock(&c->sb_lock);
-
-               bch2_recalc_capacity(c);
-               bch2_tiering_start(c);
+       if (attr == &sysfs_group) {
+               int ret = bch2_dev_group_set(c, ca, buf);
+               if (ret)
+                       return ret;
        }
 
        if (attr == &sysfs_wake_allocator)
@@ -934,8 +935,8 @@ struct attribute *bch2_dev_files[] = {
        /* settings: */
        &sysfs_discard,
        &sysfs_cache_replacement_policy,
-       &sysfs_tier,
        &sysfs_state_rw,
+       &sysfs_group,
 
        &sysfs_has_data,
        &sysfs_iostats,
index 775c2e2be686d4d97c04f7a3a2f721106549a1a1..211a844c69cf91c009d7ea757e7d61fa9388b64b 100644 (file)
 
 #include <linux/freezer.h>
 #include <linux/kthread.h>
+#include <linux/sched/cputime.h>
 #include <trace/events/bcachefs.h>
 
-static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
-                          struct bkey_s_c_extent e)
+static inline bool rebalance_ptr_pred(struct bch_fs *c,
+                                     const struct bch_extent_ptr *ptr,
+                                     struct bch_extent_crc_unpacked crc,
+                                     struct bch_io_opts *io_opts)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       if (io_opts->background_target &&
+           !dev_in_target(ca, io_opts->background_target) &&
+           !ptr->cached)
+               return true;
+
+       if (io_opts->background_compression &&
+           crc.compression_type !=
+           bch2_compression_opt_to_type[io_opts->background_compression])
+               return true;
+
+       return false;
+}
+
+void bch2_rebalance_add_key(struct bch_fs *c,
+                           struct bkey_s_c k,
+                           struct bch_io_opts *io_opts)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       struct bkey_s_c_extent e;
+
+       if (!bkey_extent_is_data(k.k))
+               return;
+
+       if (!io_opts->background_target &&
+           !io_opts->background_compression)
+               return;
+
+       e = bkey_s_c_to_extent(k);
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+                       if (!atomic64_add_return(crc.compressed_size,
+                                                &ca->rebalance_work))
+                               rebalance_wakeup(c);
+               }
+}
+
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+       if (!atomic64_add_return(sectors, &c->rebalance_work_unknown_dev))
+               rebalance_wakeup(c);
+}
+
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
+                                   enum bkey_type type,
+                                   struct bkey_s_c_extent e,
+                                   struct bch_io_opts *io_opts,
+                                   struct data_opts *data_opts)
 {
        const struct bch_extent_ptr *ptr;
-       unsigned replicas = 0;
+       struct bch_extent_crc_unpacked crc;
 
        /* Make sure we have room to add a new pointer: */
        if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
            BKEY_EXTENT_VAL_U64s_MAX)
-               return false;
+               return DATA_SKIP;
 
-       extent_for_each_ptr(e, ptr)
-               if (bch_dev_bkey_exists(c, ptr->dev)->mi.tier >= tier->idx)
-                       replicas++;
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (rebalance_ptr_pred(c, ptr, crc, io_opts))
+                       goto found;
 
-       return replicas < c->opts.data_replicas;
+       return DATA_SKIP;
+found:
+       data_opts->target               = io_opts->background_target;
+       data_opts->btree_insert_flags   = 0;
+       return DATA_ADD_REPLICAS;
 }
 
-static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
-                                 enum bkey_type type,
-                                 struct bkey_s_c_extent e,
-                                 struct bch_io_opts *io_opts,
-                                 struct data_opts *data_opts)
+struct rebalance_work {
+       unsigned        dev_most_full_percent;
+       u64             dev_most_full_work;
+       u64             dev_most_full_capacity;
+       u64             total_work;
+};
+
+static struct rebalance_work rebalance_work(struct bch_fs *c)
 {
-       struct bch_tier *tier = arg;
+       struct bch_dev *ca;
+       struct rebalance_work ret = { 0 };
+       unsigned i;
 
-       if (!__tiering_pred(c, tier, e))
-               return DATA_SKIP;
+       for_each_online_member(ca, c, i) {
+               u64 capacity = bucket_to_sector(ca, ca->mi.nbuckets -
+                                               ca->mi.first_bucket);
+               u64 work = atomic64_read(&ca->rebalance_work) +
+                       atomic64_read(&c->rebalance_work_unknown_dev);
+               unsigned percent_full = div_u64(work * 100, capacity);
+
+               if (percent_full > ret.dev_most_full_percent) {
+                       ret.dev_most_full_percent       = percent_full;
+                       ret.dev_most_full_work          = work;
+                       ret.dev_most_full_capacity      = capacity;
+               }
 
-       data_opts->btree_insert_flags = 0;
-       return DATA_ADD_REPLICAS;
+               ret.total_work += atomic64_read(&ca->rebalance_work);
+       }
+
+       ret.total_work += atomic64_read(&c->rebalance_work_unknown_dev);
+
+       return ret;
 }
 
-static int bch2_tiering_thread(void *arg)
+static void rebalance_work_reset(struct bch_fs *c)
 {
-       struct bch_tier *tier = arg;
-       struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
-       struct io_clock *clock = &c->io_clock[WRITE];
        struct bch_dev *ca;
-       struct bch_move_stats move_stats;
-       u64 tier_capacity, available_sectors;
-       unsigned long last;
-       unsigned i, nr_devices;
+       unsigned i;
+
+       for_each_online_member(ca, c, i)
+               atomic64_set(&ca->rebalance_work, 0);
+
+       atomic64_set(&c->rebalance_work_unknown_dev, 0);
+}
+
+static unsigned long curr_cputime(void)
+{
+       u64 utime, stime;
+
+       task_cputime_adjusted(current, &utime, &stime);
+       return nsecs_to_jiffies(utime + stime);
+}
+
+static int bch2_rebalance_thread(void *arg)
+{
+       struct bch_fs *c = arg;
+       struct io_clock *clock = &c->io_clock[WRITE];
+       struct rebalance_work w, p;
+       unsigned long start, prev_start;
+       unsigned long prev_run_time, prev_run_cputime;
+       unsigned long cputime, prev_cputime;
 
-       memset(&move_stats, 0, sizeof(move_stats));
        set_freezable();
 
-       while (!kthread_should_stop()) {
-               if (kthread_wait_freezable(c->tiering_enabled &&
-                                          (nr_devices = dev_mask_nr(&tier->devs))))
-                       break;
-
-               while (1) {
-                       struct bch_tier *faster_tier;
-
-                       last = atomic_long_read(&clock->now);
-
-                       tier_capacity = available_sectors = 0;
-                       for (faster_tier = c->tiers;
-                            faster_tier != tier;
-                            faster_tier++) {
-                               rcu_read_lock();
-                               for_each_member_device_rcu(ca, c, i,
-                                               &faster_tier->devs) {
-                                       tier_capacity +=
-                                               bucket_to_sector(ca,
-                                                       ca->mi.nbuckets -
-                                                       ca->mi.first_bucket);
-                                       available_sectors +=
-                                               bucket_to_sector(ca,
-                                                       dev_buckets_available(c, ca));
-                               }
-                               rcu_read_unlock();
-                       }
+       p               = rebalance_work(c);
+       prev_start      = jiffies;
+       prev_cputime    = curr_cputime();
+
+       while (!kthread_wait_freezable(c->rebalance_enabled)) {
+               struct bch_move_stats move_stats = { 0 };
 
-                       if (available_sectors < (tier_capacity >> 1))
-                               break;
+               w                       = rebalance_work(c);
+               start                   = jiffies;
+               cputime                 = curr_cputime();
+
+               prev_run_time           = start - prev_start;
+               prev_run_cputime        = cputime - prev_cputime;
+
+               if (!w.total_work) {
+                       kthread_wait_freezable(rebalance_work(c).total_work);
+                       continue;
+               }
 
-                       bch2_kthread_io_clock_wait(clock,
-                                                 last +
-                                                 available_sectors -
-                                                 (tier_capacity >> 1));
-                       if (kthread_should_stop())
-                               return 0;
+               if (w.dev_most_full_percent < 20 &&
+                   prev_run_cputime * 5 > prev_run_time) {
+                       if (w.dev_most_full_capacity) {
+                               bch2_kthread_io_clock_wait(clock,
+                                       atomic_long_read(&clock->now) +
+                                       div_u64(w.dev_most_full_capacity, 5));
+                       } else {
+
+                               set_current_state(TASK_INTERRUPTIBLE);
+                               if (kthread_should_stop())
+                                       break;
+
+                               schedule_timeout(prev_run_cputime * 5 -
+                                                prev_run_time);
+                               continue;
+                       }
                }
 
-               bch2_move_data(c, &tier->pd.rate,
-                              SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
-                              &tier->devs,
-                              writepoint_ptr(&tier->wp),
+               /* minimum 1 mb/sec: */
+               c->rebalance_pd.rate.rate =
+                       max_t(u64, 1 << 11,
+                             c->rebalance_pd.rate.rate *
+                             max(p.dev_most_full_percent, 1U) /
+                             max(w.dev_most_full_percent, 1U));
+
+               rebalance_work_reset(c);
+
+               bch2_move_data(c, &c->rebalance_pd.rate,
+                              writepoint_ptr(&c->rebalance_write_point),
                               POS_MIN, POS_MAX,
-                              tiering_pred, tier,
+                              rebalance_pred, NULL,
                               &move_stats);
        }
 
        return 0;
 }
 
-static void __bch2_tiering_stop(struct bch_tier *tier)
+void bch2_rebalance_stop(struct bch_fs *c)
 {
-       tier->pd.rate.rate = UINT_MAX;
-       bch2_ratelimit_reset(&tier->pd.rate);
-
-       if (tier->migrate)
-               kthread_stop(tier->migrate);
+       struct task_struct *p;
 
-       tier->migrate = NULL;
-}
+       c->rebalance_pd.rate.rate = UINT_MAX;
+       bch2_ratelimit_reset(&c->rebalance_pd.rate);
 
-void bch2_tiering_stop(struct bch_fs *c)
-{
-       struct bch_tier *tier;
+       p = c->rebalance_thread;
+       c->rebalance_thread = NULL;
 
-       for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
-               __bch2_tiering_stop(tier);
-}
+       if (p) {
+               /* for synchronizing with rebalance_wakeup() */
+               synchronize_rcu();
 
-static int __bch2_tiering_start(struct bch_tier *tier)
-{
-       if (!tier->migrate) {
-               struct task_struct *p =
-                       kthread_create(bch2_tiering_thread, tier,
-                                      "bch_tier[%u]", tier->idx);
-               if (IS_ERR(p))
-                       return PTR_ERR(p);
-
-               tier->migrate = p;
+               kthread_stop(p);
+               put_task_struct(p);
        }
-
-       wake_up_process(tier->migrate);
-       return 0;
 }
 
-int bch2_tiering_start(struct bch_fs *c)
+int bch2_rebalance_start(struct bch_fs *c)
 {
-       struct bch_tier *tier;
-       bool have_faster_tier = false;
+       struct task_struct *p;
 
        if (c->opts.nochanges)
                return 0;
 
-       for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
-               if (!dev_mask_nr(&tier->devs))
-                       continue;
-
-               if (have_faster_tier) {
-                       int ret = __bch2_tiering_start(tier);
-                       if (ret)
-                               return ret;
-               } else {
-                       __bch2_tiering_stop(tier);
-               }
+       p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
 
-               have_faster_tier = true;
-       }
+       get_task_struct(p);
 
+       rcu_assign_pointer(c->rebalance_thread, p);
+       wake_up_process(c->rebalance_thread);
        return 0;
 }
 
-void bch2_fs_tiering_init(struct bch_fs *c)
+void bch2_fs_rebalance_init(struct bch_fs *c)
 {
-       unsigned i;
+       bch2_pd_controller_init(&c->rebalance_pd);
 
-       for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
-               c->tiers[i].idx = i;
-               bch2_pd_controller_init(&c->tiers[i].pd);
-       }
+       atomic64_set(&c->rebalance_work_unknown_dev, S64_MAX);
 }
index f8eaa9b0e8c995cb4c1bb88234c4c4b98ebdb15e..0c66dfea7c0dd24974cf127d24059d647b0677ed 100644 (file)
@@ -1,8 +1,23 @@
 #ifndef _BCACHEFS_TIER_H
 #define _BCACHEFS_TIER_H
 
-void bch2_tiering_stop(struct bch_fs *);
-int bch2_tiering_start(struct bch_fs *);
-void bch2_fs_tiering_init(struct bch_fs *);
+static inline void rebalance_wakeup(struct bch_fs *c)
+{
+       struct task_struct *p;
+
+       rcu_read_lock();
+       p = rcu_dereference(c->rebalance_thread);
+       if (p)
+               wake_up_process(p);
+       rcu_read_unlock();
+}
+
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
+                           struct bch_io_opts *);
+void bch2_rebalance_add_work(struct bch_fs *, u64);
+
+void bch2_rebalance_stop(struct bch_fs *);
+int bch2_rebalance_start(struct bch_fs *);
+void bch2_fs_rebalance_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_TIER_H */
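rebalance_wakeup() only dereferences c->rebalance_thread under rcu_read_lock(), which is what makes the stop path above safe: the pointer is cleared, synchronize_rcu() waits out any waker still looking at the old value, and only then is the pinned task stopped and released. Condensed, the lifecycle is:

/* publish:  rcu_assign_pointer(c->rebalance_thread, p)      -- bch2_rebalance_start()
 * consume:  p = rcu_dereference(c->rebalance_thread)        -- rebalance_wakeup()
 * teardown: c->rebalance_thread = NULL;
 *           synchronize_rcu();   -- no waker still holds the old pointer
 *           kthread_stop(p); put_task_struct(p);            -- bch2_rebalance_stop()
 */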
index 1d6cbe72e05b56e1ec83e15c754b00fb07e6d0c1..81e942e5039cb6372086166270361baa4d21f043 100644 (file)
@@ -6,6 +6,7 @@
 #include "extents.h"
 #include "fs.h"
 #include "str_hash.h"
+#include "tier.h"
 #include "xattr.h"
 
 #include <linux/dcache.h>
@@ -366,6 +367,7 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
                                   const char *name, void *buffer, size_t size)
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_opts opts =
                bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
        const struct bch_option *opt;
@@ -383,12 +385,9 @@ static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
 
        v = bch2_opt_get_by_id(&opts, id);
 
-       if (opt->type == BCH_OPT_STR)
-               ret = snprintf(buffer, size, "%s", opt->choices[v]);
-       else
-               ret = snprintf(buffer, size, "%llu", v);
+       ret = bch2_opt_to_text(c, buffer, size, opt, v, 0);
 
-       return ret <= size || !buffer ? ret : -ERANGE;
+       return ret < size || !buffer ? ret : -ERANGE;
 }
 
 struct inode_opt_set {
@@ -435,17 +434,15 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                memcpy(buf, value, size);
                buf[size] = '\0';
 
-               ret = bch2_opt_parse(opt, buf, &s.v);
+               ret = bch2_opt_parse(c, opt, buf, &s.v);
                kfree(buf);
 
                if (ret < 0)
                        return ret;
 
-               if (s.id == Opt_compression) {
-                       mutex_lock(&c->sb_lock);
+               if (s.id == Opt_compression ||
+                   s.id == Opt_background_compression) {
                        ret = bch2_check_set_has_compressed_data(c, s.v);
-                       mutex_unlock(&c->sb_lock);
-
                        if (ret)
                                return ret;
                }
@@ -459,6 +456,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
        ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s);
        mutex_unlock(&inode->ei_update_lock);
 
+       if (value &&
+           (s.id == Opt_background_compression ||
+            s.id == Opt_background_target))
+               bch2_rebalance_add_work(c, inode->v.i_blocks);
+
        return ret;
 }
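With these handlers the new io options can be set per inode, and setting background_target or background_compression immediately queues the inode's existing blocks as rebalance work. A minimal userspace sketch using setxattr(2); the "bcachefs." attribute prefix is assumed here (the prefix itself is not shown in these hunks), and the value must be an existing disk group label or member device path:

/* cc -o setopt setopt.c && ./setopt /mnt/somefile */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        const char *path  = argc > 1 ? argv[1] : ".";
        const char *value = "ssd";      /* example disk group label */

        /* attribute name assumed; adjust if the handler uses a different prefix */
        if (setxattr(path, "bcachefs.background_target",
                     value, strlen(value), 0)) {
                perror("setxattr");
                return 1;
        }
        return 0;
}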