Update bcachefs sources to 62de7539dc bcachefs: Make bkey types globally unique
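
This update tracks the "Make bkey types globally unique" change: bch2_alloc_invalid() and bch2_alloc_to_text() no longer switch on k.k->type (dispatch now happens per key type before the per-type hooks run), bch2_alloc_read_key() compares against KEY_TYPE_alloc instead of the old per-btree BCH_ALLOC, and the to_text hook prints into a struct printbuf. As a rough illustration of the dispatch pattern this enables, here is a minimal, self-contained C sketch; the identifiers (struct bkey, struct key_ops, alloc_invalid, key_invalid) are simplified stand-ins, not the actual bcachefs API.

#include <stdio.h>
#include <stddef.h>

/*
 * Simplified stand-ins for the real bcachefs types: with globally unique
 * key types, a single table can dispatch ->key_invalid / ->key_to_text
 * per type, and each per-type hook may assume it only sees its own type.
 */
enum key_type {
	KEY_TYPE_deleted,
	KEY_TYPE_alloc,		/* cf. KEY_TYPE_alloc in the real tree */
	KEY_TYPE_NR,
};

struct bkey {
	enum key_type	type;
	unsigned	val_u64s;	/* size of the value, in u64s */
};

struct key_ops {
	/* return an error string, or NULL if the key is valid */
	const char	*(*key_invalid)(const struct bkey *k);
	void		(*key_to_text)(char *buf, size_t size,
				       const struct bkey *k);
};

static const char *alloc_invalid(const struct bkey *k)
{
	/* allow for unknown fields: the value may be larger than expected */
	if (k->val_u64s < 1)
		return "incorrect value size";
	return NULL;
}

static void alloc_to_text(char *buf, size_t size, const struct bkey *k)
{
	snprintf(buf, size, "alloc key, %u value u64s", k->val_u64s);
}

static const struct key_ops key_ops[KEY_TYPE_NR] = {
	[KEY_TYPE_alloc] = { alloc_invalid, alloc_to_text },
};

/* generic entry point: dispatch once on the global type, then call the hook */
static const char *key_invalid(const struct bkey *k)
{
	if (k->type >= KEY_TYPE_NR || !key_ops[k->type].key_invalid)
		return "invalid type";
	return key_ops[k->type].key_invalid(k);
}

int main(void)
{
	struct bkey k = { .type = KEY_TYPE_alloc, .val_u64s = 2 };
	const char *err = key_invalid(&k);
	char buf[64];

	printf("invalid: %s\n", err ? err : "(none)");
	key_ops[k.type].key_to_text(buf, sizeof(buf), &k);
	printf("%s\n", buf);
	return 0;
}
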
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index c3efb4357cad110833c9c2350faac615707c6a72..2e2fb99ed0d3c17d394562cd8200f283a17e39b4 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -9,6 +9,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "journal_io.h"
 
@@ -74,36 +75,25 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 
 const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
+       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+
        if (k.k->p.inode >= c->sb.nr_devices ||
            !c->devs[k.k->p.inode])
                return "invalid device";
 
-       switch (k.k->type) {
-       case BCH_ALLOC: {
-               struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-
-               if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
-                       return "incorrect value size";
-               break;
-       }
-       default:
-               return "invalid type";
-       }
+       /* allow for unknown fields */
+       if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
+               return "incorrect value size";
 
        return NULL;
 }
 
-int bch2_alloc_to_text(struct bch_fs *c, char *buf,
-                      size_t size, struct bkey_s_c k)
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+                       struct bkey_s_c k)
 {
-       buf[0] = '\0';
+       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
-       switch (k.k->type) {
-       case BCH_ALLOC:
-               break;
-       }
-
-       return 0;
+       pr_buf(out, "gen %u", a.v->gen);
 }
 
 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
@@ -155,7 +145,7 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
        struct bucket *g;
        const u8 *d;
 
-       if (k.k->type != BCH_ALLOC)
+       if (k.k->type != KEY_TYPE_alloc)
                return;
 
        a = bkey_s_c_to_alloc(k);
@@ -235,6 +225,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
        __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
        struct bucket *g;
        struct bkey_i_alloc *a;
+       int ret;
        u8 *d;
 
        percpu_down_read_preempt_disable(&c->usage_lock);
@@ -258,32 +249,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 
        bch2_btree_iter_set_pos(iter, a->k.p);
 
-       return bch2_btree_insert_at(c, NULL, journal_seq,
-                                   BTREE_INSERT_NOFAIL|
-                                   BTREE_INSERT_USE_RESERVE|
-                                   BTREE_INSERT_USE_ALLOC_RESERVE|
-                                   flags,
-                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
+       ret = bch2_btree_insert_at(c, NULL, journal_seq,
+                                  BTREE_INSERT_NOFAIL|
+                                  BTREE_INSERT_USE_RESERVE|
+                                  BTREE_INSERT_USE_ALLOC_RESERVE|
+                                  flags,
+                                  BTREE_INSERT_ENTRY(iter, &a->k_i));
+
+       if (!ret && ca->buckets_written)
+               set_bit(b, ca->buckets_written);
+
+       return ret;
 }
 
-int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
+int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
 {
        struct bch_dev *ca;
        struct btree_iter iter;
        int ret;
 
-       if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+       if (k->k.p.inode >= c->sb.nr_devices ||
+           !c->devs[k->k.p.inode])
                return 0;
 
-       ca = bch_dev_bkey_exists(c, pos.inode);
+       ca = bch_dev_bkey_exists(c, k->k.p.inode);
 
-       if (pos.offset >= ca->mi.nbuckets)
+       if (k->k.p.offset >= ca->mi.nbuckets)
                return 0;
 
-       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
+                            BTREE_ITER_INTENT);
 
-       ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto err;
+
+       /* check buckets_written with btree node locked: */
+
+       ret = test_bit(k->k.p.offset, ca->buckets_written)
+               ? 0
+               : bch2_btree_insert_at(c, NULL, NULL,
+                                      BTREE_INSERT_NOFAIL|
+                                      BTREE_INSERT_JOURNAL_REPLAY,
+                                      BTREE_INSERT_ENTRY(&iter, k));
+err:
        bch2_btree_iter_unlock(&iter);
        return ret;
 }
@@ -373,6 +382,11 @@ static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
        }
 }
 
+static inline u64 bucket_clock_freq(u64 capacity)
+{
+       return max(capacity >> 10, 2028ULL);
+}
+
 static void bch2_inc_clock_hand(struct io_timer *timer)
 {
        struct bucket_clock *clock = container_of(timer,
@@ -411,7 +425,7 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
         * RW mode (that will be 0 when we're RO, yet we can still service
         * reads)
         */
-       timer->expire += capacity >> 10;
+       timer->expire += bucket_clock_freq(capacity);
 
        bch2_io_timer_add(&c->io_clock[clock->rw], timer);
 }
@@ -423,7 +437,7 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
        clock->hand             = 1;
        clock->rw               = rw;
        clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = c->capacity >> 10;
+       clock->rescale.expire   = bucket_clock_freq(c->capacity);
        mutex_init(&clock->lock);
 }
 
@@ -904,12 +918,6 @@ static int bch2_allocator_thread(void *arg)
                pr_debug("free_inc now empty");
 
                do {
-                       if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
-                               up_read(&c->gc_lock);
-                               bch_err(ca, "gc failure");
-                               goto stop;
-                       }
-
                        /*
                         * Find some buckets that we can invalidate, either
                         * they're completely unused, or only contain clean data
@@ -974,6 +982,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 {
        struct bch_dev *ca;
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+       unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
        unsigned i, j;
 
@@ -1009,14 +1018,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                for (j = 0; j < RESERVE_NONE; j++)
                        dev_reserve += ca->free[j].size;
 
-               dev_reserve += ca->free_inc.size;
-
-               dev_reserve += ARRAY_SIZE(c->write_points);
-
                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
                dev_reserve += 1;       /* rebalance write point */
-               dev_reserve += WRITE_POINT_COUNT;
 
                dev_reserve *= ca->mi.bucket_size;
 
@@ -1026,6 +1030,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                                             ca->mi.first_bucket);
 
                reserved_sectors += dev_reserve * 2;
+
+               bucket_size_max = max_t(unsigned, bucket_size_max,
+                                       ca->mi.bucket_size);
        }
 
        gc_reserve = c->opts.gc_reserve_bytes
@@ -1038,6 +1045,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
        c->capacity = capacity - reserved_sectors;
 
+       c->bucket_size_max = bucket_size_max;
+
        if (c->capacity) {
                bch2_io_timer_add(&c->io_clock[READ],
                                 &c->bucket_clock[READ].rescale);
@@ -1106,6 +1115,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
        }
        mutex_unlock(&c->btree_reserve_cache_lock);
 
+       while (1) {
+               struct open_bucket *ob;
+
+               spin_lock(&c->freelist_lock);
+               if (!ca->open_buckets_partial_nr) {
+                       spin_unlock(&c->freelist_lock);
+                       break;
+               }
+               ob = c->open_buckets +
+                       ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+               ob->on_partial_list = false;
+               spin_unlock(&c->freelist_lock);
+
+               bch2_open_bucket_put(c, ob);
+       }
+
+       bch2_ec_stop_dev(c, ca);
+
        /*
         * Wake up threads that were blocked on allocation, so they can notice
         * the device can no longer be removed and the capacity has changed:
@@ -1248,9 +1275,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
        bool invalidating_data = false;
        int ret = 0;
 
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return -1;
-
        if (test_alloc_startup(c)) {
                invalidating_data = true;
                goto not_enough;
@@ -1258,51 +1282,47 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 
        /* Scan for buckets that are already invalidated: */
        for_each_rw_member(ca, c, dev_iter) {
-               struct btree_iter iter;
+               struct bucket_array *buckets;
                struct bucket_mark m;
-               struct bkey_s_c k;
 
-               for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
-                       if (k.k->type != BCH_ALLOC)
-                               continue;
+               down_read(&ca->bucket_lock);
+               percpu_down_read_preempt_disable(&c->usage_lock);
 
-                       bu = k.k->p.offset;
-                       m = READ_ONCE(bucket(ca, bu)->mark);
+               buckets = bucket_array(ca);
 
-                       if (!is_available_bucket(m) || m.cached_sectors)
+               for (bu = buckets->first_bucket;
+                    bu < buckets->nbuckets; bu++) {
+                       m = READ_ONCE(buckets->b[bu].mark);
+
+                       if (!m.gen_valid ||
+                           !is_available_bucket(m) ||
+                           m.cached_sectors)
                                continue;
 
-                       percpu_down_read_preempt_disable(&c->usage_lock);
                        bch2_mark_alloc_bucket(c, ca, bu, true,
-                                       gc_pos_alloc(c, NULL),
-                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
-                       percpu_up_read_preempt_enable(&c->usage_lock);
+                                       gc_pos_alloc(c, NULL), 0);
 
                        fifo_push(&ca->free_inc, bu);
 
-                       if (fifo_full(&ca->free_inc))
+                       discard_invalidated_buckets(c, ca);
+
+                       if (fifo_full(&ca->free[RESERVE_BTREE]))
                                break;
                }
-               bch2_btree_iter_unlock(&iter);
+               percpu_up_read_preempt_enable(&c->usage_lock);
+               up_read(&ca->bucket_lock);
        }
 
        /* did we find enough buckets? */
        for_each_rw_member(ca, c, dev_iter)
-               if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+               if (!fifo_full(&ca->free[RESERVE_BTREE])) {
                        percpu_ref_put(&ca->io_ref);
                        goto not_enough;
                }
 
        return 0;
 not_enough:
-       pr_debug("did not find enough empty buckets; issuing discards");
-
-       /* clear out free_inc, we'll be using it again below: */
-       for_each_rw_member(ca, c, dev_iter)
-               discard_invalidated_buckets(c, ca);
-
-       pr_debug("scanning for reclaimable buckets");
+       pr_debug("not enough empty buckets; scanning for reclaimable buckets");
 
        for_each_rw_member(ca, c, dev_iter) {
                find_reclaimable_buckets(c, ca);
@@ -1329,8 +1349,6 @@ not_enough:
         * invalidated on disk:
         */
        if (invalidating_data) {
-               BUG();
-               pr_info("holding writes");
                pr_debug("invalidating existing data");
                set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
        } else {
@@ -1390,40 +1408,12 @@ int bch2_fs_allocator_start(struct bch_fs *c)
        return bch2_alloc_write(c);
 }
 
-void bch2_fs_allocator_init(struct bch_fs *c)
+void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
-       struct open_bucket *ob;
-       struct write_point *wp;
-
-       mutex_init(&c->write_points_hash_lock);
        spin_lock_init(&c->freelist_lock);
        bch2_bucket_clock_init(c, READ);
        bch2_bucket_clock_init(c, WRITE);
 
-       /* open bucket 0 is a sentinal NULL: */
-       spin_lock_init(&c->open_buckets[0].lock);
-
-       for (ob = c->open_buckets + 1;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
-               spin_lock_init(&ob->lock);
-               c->open_buckets_nr_free++;
-
-               ob->freelist = c->open_buckets_freelist;
-               c->open_buckets_freelist = ob - c->open_buckets;
-       }
-
-       writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
-       writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
-
-       for (wp = c->write_points;
-            wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) {
-               writepoint_init(wp, BCH_DATA_USER);
-
-               wp->last_used   = sched_clock();
-               wp->write_point = (unsigned long) wp;
-               hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
-       }
-
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
 }