git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 8bf4b038d4 bcachefs: Assorted fixes for running on very...
author Kent Overstreet <kent.overstreet@gmail.com>
Mon, 5 Nov 2018 03:18:23 +0000 (22:18 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Mon, 5 Nov 2018 03:21:00 +0000 (22:21 -0500)
14 files changed:
.bcachefs_revision
libbcachefs.c
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/recovery.c
libbcachefs/super-io.c
libbcachefs/super.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index b6371345019d7e26fa0ad2bec1f3fbc89bd5e0b1..9f81e277c7049c2702dfe86eedb5e4a6c5ac1c9b 100644 (file)
@@ -1 +1 @@
-defaad6d47791d3e6285cba323f92847b6e4c226
+8bf4b038d41230504d3f0315a35e4d7a056e0a65
diff --git a/libbcachefs.c b/libbcachefs.c
index 3ce69d1b788d6f4e0e1788d1a444bf9abbd80f64..c8738f408ccf2927beb66dd83b1ff0a009c282c7 100644 (file)
@@ -26,8 +26,6 @@
 
 #define NSEC_PER_SEC   1000000000L
 
-#define BCH_MIN_NR_NBUCKETS    (1 << 10)
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index c3efb4357cad110833c9c2350faac615707c6a72..9ff61deb1f5d50d5c07265cfb7c5f0bbd7708f59 100644 (file)
@@ -373,6 +373,11 @@ static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
        }
 }
 
+static inline u64 bucket_clock_freq(u64 capacity)
+{
+       return max(capacity >> 10, 2028ULL);
+}
+
 static void bch2_inc_clock_hand(struct io_timer *timer)
 {
        struct bucket_clock *clock = container_of(timer,
@@ -411,7 +416,7 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
         * RW mode (that will be 0 when we're RO, yet we can still service
         * reads)
         */
-       timer->expire += capacity >> 10;
+       timer->expire += bucket_clock_freq(capacity);
 
        bch2_io_timer_add(&c->io_clock[clock->rw], timer);
 }
@@ -423,7 +428,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
        clock->hand             = 1;
        clock->rw               = rw;
        clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = c->capacity >> 10;
+       clock->rescale.expire   = bucket_clock_freq(c->capacity);
        mutex_init(&clock->lock);
 }
 
@@ -974,6 +979,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 {
        struct bch_dev *ca;
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+       unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
        unsigned i, j;
 
@@ -1009,14 +1015,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                for (j = 0; j < RESERVE_NONE; j++)
                        dev_reserve += ca->free[j].size;
 
-               dev_reserve += ca->free_inc.size;
-
-               dev_reserve += ARRAY_SIZE(c->write_points);
-
                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
                dev_reserve += 1;       /* rebalance write point */
-               dev_reserve += WRITE_POINT_COUNT;
 
                dev_reserve *= ca->mi.bucket_size;
 
@@ -1026,6 +1027,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                                             ca->mi.first_bucket);
 
                reserved_sectors += dev_reserve * 2;
+
+               bucket_size_max = max_t(unsigned, bucket_size_max,
+                                       ca->mi.bucket_size);
        }
 
        gc_reserve = c->opts.gc_reserve_bytes
@@ -1038,6 +1042,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
        c->capacity = capacity - reserved_sectors;
 
+       c->bucket_size_max = bucket_size_max;
+
        if (c->capacity) {
                bch2_io_timer_add(&c->io_clock[READ],
                                 &c->bucket_clock[READ].rescale);
@@ -1329,8 +1335,6 @@ not_enough:
         * invalidated on disk:
         */
        if (invalidating_data) {
-               BUG();
-               pr_info("holding writes");
                pr_debug("invalidating existing data");
                set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
        } else {
@@ -1390,40 +1394,12 @@ int bch2_fs_allocator_start(struct bch_fs *c)
        return bch2_alloc_write(c);
 }
 
-void bch2_fs_allocator_init(struct bch_fs *c)
+void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
-       struct open_bucket *ob;
-       struct write_point *wp;
-
-       mutex_init(&c->write_points_hash_lock);
        spin_lock_init(&c->freelist_lock);
        bch2_bucket_clock_init(c, READ);
        bch2_bucket_clock_init(c, WRITE);
 
-       /* open bucket 0 is a sentinal NULL: */
-       spin_lock_init(&c->open_buckets[0].lock);
-
-       for (ob = c->open_buckets + 1;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
-               spin_lock_init(&ob->lock);
-               c->open_buckets_nr_free++;
-
-               ob->freelist = c->open_buckets_freelist;
-               c->open_buckets_freelist = ob - c->open_buckets;
-       }
-
-       writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
-       writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
-
-       for (wp = c->write_points;
-            wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
-               writepoint_init(wp, BCH_DATA_USER);
-
-               wp->last_used   = sched_clock();
-               wp->write_point = (unsigned long) wp;
-               hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
-       }
-
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
 }
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index b5dbf7eb7c8295a936e910a4d03e9bea8ea01a63..ea07705bd173e933bd64419294f5e0dca73a28bd 100644 (file)
@@ -5,7 +5,7 @@
 #include "alloc_types.h"
 #include "debug.h"
 
-#define ALLOC_SCAN_BATCH(ca)           ((ca)->mi.nbuckets >> 9)
+#define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
 const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
 int bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
@@ -56,6 +56,6 @@ int bch2_dev_allocator_start(struct bch_dev *);
 
 int bch2_alloc_write(struct bch_fs *);
 int bch2_fs_allocator_start(struct bch_fs *);
-void bch2_fs_allocator_init(struct bch_fs *);
+void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index be94196eb2d0ab61bf83a708c6db611823968f9d..06859960d906910daf09f14b33e78aeac1b8dcf0 100644 (file)
@@ -491,7 +491,7 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
 
        mutex_lock(&wp->lock);
        open_bucket_for_each(c, &wp->ptrs, ob, i)
-               if (ob->ptr.dev == ca->dev_idx)
+               if (!ca || ob->ptr.dev == ca->dev_idx)
                        open_bucket_free_unused(c, wp, ob);
                else
                        ob_push(c, &ptrs, ob);
@@ -500,6 +500,15 @@ void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
        mutex_unlock(&wp->lock);
 }
 
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
+                                                unsigned long write_point)
+{
+       unsigned hash =
+               hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
+
+       return &c->write_points_hash[hash];
+}
+
 static struct write_point *__writepoint_find(struct hlist_head *head,
                                             unsigned long write_point)
 {
@@ -512,6 +521,53 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
        return NULL;
 }
 
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
+{
+       u64 stranded    = c->write_points_nr * c->bucket_size_max;
+       u64 free        = bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
+
+       return stranded * factor > free;
+}
+
+static bool try_increase_writepoints(struct bch_fs *c)
+{
+       struct write_point *wp;
+
+       if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
+           too_many_writepoints(c, 32))
+               return false;
+
+       wp = c->write_points + c->write_points_nr++;
+       hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
+       return true;
+}
+
+static bool try_decrease_writepoints(struct bch_fs *c,
+                                    unsigned old_nr)
+{
+       struct write_point *wp;
+
+       mutex_lock(&c->write_points_hash_lock);
+       if (c->write_points_nr < old_nr) {
+               mutex_unlock(&c->write_points_hash_lock);
+               return true;
+       }
+
+       if (c->write_points_nr == 1 ||
+           !too_many_writepoints(c, 8)) {
+               mutex_unlock(&c->write_points_hash_lock);
+               return false;
+       }
+
+       wp = c->write_points + --c->write_points_nr;
+
+       hlist_del_rcu(&wp->node);
+       mutex_unlock(&c->write_points_hash_lock);
+
+       bch2_writepoint_stop(c, NULL, wp);
+       return true;
+}
+
 static struct write_point *writepoint_find(struct bch_fs *c,
                                           unsigned long write_point)
 {
@@ -535,16 +591,22 @@ lock_wp:
                mutex_unlock(&wp->lock);
                goto restart_find;
        }
-
+restart_find_oldest:
        oldest = NULL;
        for (wp = c->write_points;
-            wp < c->write_points + ARRAY_SIZE(c->write_points);
-            wp++)
+            wp < c->write_points + c->write_points_nr; wp++)
                if (!oldest || time_before64(wp->last_used, oldest->last_used))
                        oldest = wp;
 
        mutex_lock(&oldest->lock);
        mutex_lock(&c->write_points_hash_lock);
+       if (oldest >= c->write_points + c->write_points_nr ||
+           try_increase_writepoints(c)) {
+               mutex_unlock(&c->write_points_hash_lock);
+               mutex_unlock(&oldest->lock);
+               goto restart_find_oldest;
+       }
+
        wp = __writepoint_find(head, write_point);
        if (wp && wp != oldest) {
                mutex_unlock(&c->write_points_hash_lock);
@@ -580,10 +642,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        unsigned nr_effective = 0;
        struct open_buckets ptrs = { .nr = 0 };
        bool have_cache = false;
+       unsigned write_points_nr;
        int ret = 0, i;
 
        BUG_ON(!nr_replicas || !nr_replicas_required);
-
+retry:
+       write_points_nr = c->write_points_nr;
        wp = writepoint_find(c, write_point.v);
 
        if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
@@ -636,6 +700,11 @@ err:
        wp->ptrs = ptrs;
 
        mutex_unlock(&wp->lock);
+
+       if (ret == -ENOSPC &&
+           try_decrease_writepoints(c, write_points_nr))
+               goto retry;
+
        return ERR_PTR(ret);
 }
 
@@ -687,3 +756,37 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 
        bch2_open_buckets_put(c, &ptrs);
 }
+
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
+{
+       struct open_bucket *ob;
+       struct write_point *wp;
+
+       mutex_init(&c->write_points_hash_lock);
+       c->write_points_nr = ARRAY_SIZE(c->write_points);
+
+       /* open bucket 0 is a sentinal NULL: */
+       spin_lock_init(&c->open_buckets[0].lock);
+
+       for (ob = c->open_buckets + 1;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
+               spin_lock_init(&ob->lock);
+               c->open_buckets_nr_free++;
+
+               ob->freelist = c->open_buckets_freelist;
+               c->open_buckets_freelist = ob - c->open_buckets;
+       }
+
+       writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
+       writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
+
+       for (wp = c->write_points;
+            wp < c->write_points + c->write_points_nr; wp++) {
+               writepoint_init(wp, BCH_DATA_USER);
+
+               wp->last_used   = sched_clock();
+               wp->write_point = (unsigned long) wp;
+               hlist_add_head_rcu(&wp->node,
+                                  writepoint_hash(c, wp->write_point));
+       }
+}
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index ae9844b5a6e7cf007bb579b1f2edf78e53d106ae..729afc922b7f869b09580c9cff6112a853a320dc 100644 (file)
@@ -90,15 +90,6 @@ void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
                          struct write_point *);
 
-static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
-                                                unsigned long write_point)
-{
-       unsigned hash =
-               hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
-
-       return &c->write_points_hash[hash];
-}
-
 static inline struct write_point_specifier writepoint_hashed(unsigned long v)
 {
        return (struct write_point_specifier) { .v = v | 1 };
@@ -116,4 +107,6 @@ static inline void writepoint_init(struct write_point *wp,
        wp->type = type;
 }
 
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 94c041d2f3294f314e27b426d4ff77cefd1fade8..110663ffc17052786d00412a3e92acc7066595ea 100644 (file)
@@ -45,7 +45,9 @@ typedef FIFO(long)    alloc_fifo;
 
 /* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
 #define OPEN_BUCKETS_COUNT     256
-#define WRITE_POINT_COUNT      32
+
+#define WRITE_POINT_HASH_NR    32
+#define WRITE_POINT_MAX                32
 
 struct open_bucket {
        spinlock_t              lock;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 6d5c7d6b848413567d84ccc9692390d5f73bdb4f..e23f45e88e96a11c77f91dbc82273c2eb154b818 100644 (file)
@@ -322,7 +322,7 @@ enum bch_time_stats {
 #define BTREE_RESERVE_MAX      (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
 
 /* Size of the freelist we allocate btree nodes from: */
-#define BTREE_NODE_RESERVE     (BTREE_RESERVE_MAX * 4)
+#define BTREE_NODE_RESERVE     BTREE_RESERVE_MAX
 
 struct btree;
 
@@ -598,6 +598,7 @@ struct bch_fs {
         * and forces them to be revalidated
         */
        u32                     capacity_gen;
+       unsigned                bucket_size_max;
 
        atomic64_t              sectors_available;
 
@@ -627,9 +628,10 @@ struct bch_fs {
        struct write_point      btree_write_point;
        struct write_point      rebalance_write_point;
 
-       struct write_point      write_points[WRITE_POINT_COUNT];
-       struct hlist_head       write_points_hash[WRITE_POINT_COUNT];
+       struct write_point      write_points[WRITE_POINT_MAX];
+       struct hlist_head       write_points_hash[WRITE_POINT_HASH_NR];
        struct mutex            write_points_hash_lock;
+       unsigned                write_points_nr;
 
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index cdf392b39bb8692a32c504811851b5afe2547629..7ad080bfff318f0b4ef3e71e279364132bcca196 100644 (file)
@@ -904,6 +904,8 @@ struct bch_sb_field_journal {
 
 /* BCH_SB_FIELD_members: */
 
+#define BCH_MIN_NR_NBUCKETS    (1 << 6)
+
 struct bch_member {
        uuid_le                 uuid;
        __le64                  nbuckets;       /* device size */
@@ -1381,7 +1383,7 @@ struct jset {
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
 LE32_BITMASK(JSET_BIG_ENDIAN,  struct jset, flags, 4, 5);
 
-#define BCH_JOURNAL_BUCKETS_MIN                20
+#define BCH_JOURNAL_BUCKETS_MIN                8
 
 /* Btree: */
 
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 271c02f1a5a70be8c05dd054246320679461b59f..15a07e36d5cb72e57540884396b3e2a34096e254 100644 (file)
@@ -299,11 +299,6 @@ u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
        return min(c->capacity, __bch2_fs_sectors_used(c, stats));
 }
 
-static u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
-{
-       return c->capacity - bch2_fs_sectors_used(c, stats);
-}
-
 static inline int is_unavailable_bucket(struct bucket_mark m)
 {
        return !is_available_bucket(m);
@@ -883,9 +878,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
                             ca->mi.bucket_size / c->opts.btree_node_size);
        /* XXX: these should be tunable */
-       size_t reserve_none     = max_t(size_t, 4, nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 16, nbuckets >> 7);
-       size_t free_inc_nr      = max(max_t(size_t, 16, nbuckets >> 12),
+       size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
+       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
+       size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
                                      btree_reserve);
        bool resize = ca->buckets != NULL,
             start_copygc = ca->copygc_thread != NULL;
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index d9fe938af4b4f61e1bf1a09d0b3491ef1b2a2efc..17b82cd07dbaf5acd4c8f6b4391f563c7aa03371 100644 (file)
@@ -174,6 +174,12 @@ void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
 
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
+static inline u64 bch2_fs_sectors_free(struct bch_fs *c,
+                                      struct bch_fs_usage stats)
+{
+       return c->capacity - bch2_fs_sectors_used(c, stats);
+}
+
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
        return (!mark.owned_by_allocator &&
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 902f39f659c2ea185e24f6256f4d647cd323cb35..f530f2028deb6ac59788c015ebf19064f0be6342 100644 (file)
@@ -277,7 +277,7 @@ out:
        return ret;
 err:
 fsck_err:
-       BUG_ON(!ret);
+       pr_err("Error in recovery: %s (%i)", err, ret);
        goto out;
 }
 
@@ -380,6 +380,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 
        return 0;
 err:
-       BUG_ON(!ret);
+       pr_err("Error initializing new filesystem: %s (%i)", err, ret);
        return ret;
 }
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 54de9fac6e2283e63487a691f18523ba89522f24..8ef5db3dc023b76f900b5cb99086e657c5b1c6ed 100644 (file)
@@ -808,7 +808,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb,
                        return "Too many buckets";
 
                if (le64_to_cpu(m->nbuckets) -
-                   le16_to_cpu(m->first_bucket) < 1 << 10)
+                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
                        return "Not enough buckets";
 
                if (le16_to_cpu(m->bucket_size) <
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index be28d40f9b4153e42ae68e8f1ae60e3a1a2d146a..b7a6f5fb88a86860d32c049d854675c7817dd7f3 100644 (file)
@@ -556,7 +556,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_init(&c->times[i]);
 
-       bch2_fs_allocator_init(c);
+       bch2_fs_allocator_background_init(c);
+       bch2_fs_allocator_foreground_init(c);
        bch2_fs_rebalance_init(c);
        bch2_fs_quota_init(c);