Update bcachefs sources to 62de7539dc bcachefs: Make bkey types globally unique
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 125159ee5ec269ba93bfcb22f5cbb3b6626981fd..d6890824912d436088eb55e886e75e5659b4d0e9 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -5,9 +5,12 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_foreground.h"
 #include "btree_iter.h"
+#include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
+#include "disk_groups.h"
 #include "extents.h"
 #include "eytzinger.h"
 #include "io.h"
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
+#include <linux/sched/task.h>
 #include <linux/sort.h>
 #include <linux/wait.h>
 
-/* Moving GC - IO loop */
-
-static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
-{
-       const struct bucket_heap_entry *l = _l;
-       const struct bucket_heap_entry *r = _r;
+/*
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
+ * need the buckets we're freeing up to go back into the copygc reserve to make
+ * forward progress, but if the copygc reserve is full they'll be available for
+ * any allocation - and it's possible that in a given iteration, we free up most
+ * of the buckets we're going to free before we allocate most of the buckets
+ * we're going to allocate.
+ *
+ * If we only use half of the reserve per iteration, then in steady state we'll
+ * always have room in the reserve for the buckets we're going to need in the
+ * next iteration:
+ */
+#define COPYGC_BUCKETS_PER_ITER(ca)                                    \
+       ((ca)->free[RESERVE_MOVINGGC].size / 2)
 
-       if (l->bucket < r->bucket)
-               return -1;
-       if (l->bucket > r->bucket)
-               return 1;
-       return 0;
-}
+/*
+ * Max sectors to move per iteration: Have to take into account internal
+ * fragmentation from the multiple write points for each generation:
+ */
+#define COPYGC_SECTORS_PER_ITER(ca)                                    \
+       ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
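The half-reserve rule is easiest to see with a toy model. The sketch below is
illustrative only (RESERVE_SIZE and model_iterations() are made up, not
bcachefs code): with a reserve of N buckets and at most N/2 consumed per
iteration, the N/2 buckets evacuated by iteration i have refilled the reserve
by the time iteration i + 1 needs them, so the reserve never runs dry even in
the worst-case interleaving:

	#include <assert.h>

	#define RESERVE_SIZE	16			/* hypothetical reserve */
	#define PER_ITER	(RESERVE_SIZE / 2)	/* the rule above */

	static void model_iterations(unsigned n)
	{
		unsigned reserve = RESERVE_SIZE;

		while (n--) {
			/* worst case: all allocations land before any frees */
			assert(reserve >= PER_ITER);
			reserve -= PER_ITER;	/* allocate destination buckets */
			reserve += PER_ITER;	/* evacuated buckets freed back */
		}
	}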
 
-static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
-                                               struct bkey_s_c k)
+static inline int sectors_used_cmp(copygc_heap *heap,
+                                  struct copygc_heap_entry l,
+                                  struct copygc_heap_entry r)
 {
-       bucket_heap *h = &ca->copygc_heap;
-       const struct bch_extent_ptr *ptr;
-
-       if (bkey_extent_is_data(k.k) &&
-           (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
-                                         ca->dev_idx))) {
-               struct bucket_heap_entry search = {
-                       .bucket = PTR_BUCKET_NR(ca, ptr)
-               };
-
-               size_t i = eytzinger0_find(h->data, h->used,
-                                          sizeof(h->data[0]),
-                                          bucket_idx_cmp, &search);
-
-               if (i < h->used)
-                       return ptr;
-       }
-
-       return NULL;
+       return (l.sectors > r.sectors) - (l.sectors < r.sectors);
 }
 
-static int issue_moving_gc_move(struct bch_dev *ca,
-                               struct moving_context *ctxt,
-                               struct bkey_s_c k)
+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
 {
-       struct bch_fs *c = ca->fs;
-       const struct bch_extent_ptr *ptr;
-       int ret;
-
-       ptr = moving_pred(ca, k);
-       if (!ptr) /* We raced - bucket's been reused */
-               return 0;
+       const struct copygc_heap_entry *l = _l;
+       const struct copygc_heap_entry *r = _r;
 
-       ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
-       if (!ret)
-               trace_gc_copy(k.k);
-       else
-               trace_moving_gc_alloc_fail(c, k.k->size);
-       return ret;
+       return (l->offset > r->offset) - (l->offset < r->offset);
 }
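Both new comparators use the branchless (l > r) - (l < r) idiom instead of
returning l - r. A quick standalone check (the main() here is only for
demonstration, not part of the diff): subtraction can overflow or truncate
when narrowed to int, while the idiom always yields exactly -1, 0 or 1:

	#include <assert.h>
	#include <stdint.h>

	/* branchless three-way compare: -1, 0 or 1, no overflow */
	static int cmp_u64(uint64_t l, uint64_t r)
	{
		return (l > r) - (l < r);
	}

	int main(void)
	{
		assert(cmp_u64(1, 2) == -1);
		assert(cmp_u64(2, 2) ==  0);
		/* l - r would be huge here; naive subtraction misbehaves */
		assert(cmp_u64(UINT64_MAX, 0) == 1);
		return 0;
	}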
 
-static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
-                       u64 sectors_to_move)
+static bool __copygc_pred(struct bch_dev *ca,
+                         struct bkey_s_c k)
 {
-       struct bch_fs *c = ca->fs;
-       bucket_heap *h = &ca->copygc_heap;
-       struct moving_context ctxt;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 sectors_not_moved = 0;
-       size_t buckets_not_moved = 0;
-       struct bucket_heap_entry *i;
-
-       bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
-       bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
-                               SECTORS_IN_FLIGHT_PER_DEVICE);
-       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
-                            BTREE_ITER_PREFETCH);
-
-       while (1) {
-               if (kthread_should_stop())
-                       goto out;
-               if (bch2_move_ctxt_wait(&ctxt))
-                       goto out;
-               k = bch2_btree_iter_peek(&iter);
-               if (!k.k)
-                       break;
-               if (btree_iter_err(k))
-                       goto out;
+       copygc_heap *h = &ca->copygc_heap;
 
-               if (!moving_pred(ca, k))
-                       goto next;
+       switch (k.k->type) {
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const struct bch_extent_ptr *ptr =
+                       bch2_extent_has_device(e, ca->dev_idx);
 
-               if (issue_moving_gc_move(ca, &ctxt, k)) {
-                       bch2_btree_iter_unlock(&iter);
+               if (ptr) {
+                       struct copygc_heap_entry search = { .offset = ptr->offset };
 
-                       /* memory allocation failure, wait for some IO to finish */
-                       bch2_move_ctxt_wait_for_io(&ctxt);
-                       continue;
-               }
-next:
-               bch2_btree_iter_advance_pos(&iter);
-               //bch2_btree_iter_cond_resched(&iter);
+                       ssize_t i = eytzinger0_find_le(h->data, h->used,
+                                                      sizeof(h->data[0]),
+                                                      bucket_offset_cmp, &search);
 
-               /* unlock before calling moving_context_wait() */
-               bch2_btree_iter_unlock(&iter);
-               cond_resched();
+                       return (i >= 0 &&
+                               ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
+                               ptr->gen == h->data[i].gen);
+               }
+               break;
+       }
        }
 
-       bch2_btree_iter_unlock(&iter);
-       bch2_move_ctxt_exit(&ctxt);
-       trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
-                                  buckets_to_move);
+       return false;
+}
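__copygc_pred() answers "does this extent pointer land in a bucket chosen for
evacuation?": eytzinger0_find_le() returns the entry with the greatest offset
less than or equal to ptr->offset, and the pointer matches if it falls inside
that bucket and the generation still agrees (a stale gen means the bucket was
reused since the heap was built). A simplified sketch of the same check over
a plain sorted array - the eytzinger layout is a cache-friendlier encoding of
this search; struct entry and the linear floor search below are stand-ins,
not the bcachefs helpers:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct entry {
		uint8_t		gen;	/* bucket generation when chosen */
		uint64_t	offset;	/* sector offset of bucket start */
	};

	/* e[] is sorted by offset; bucket_size is in sectors */
	static bool ptr_in_chosen_bucket(const struct entry *e, size_t nr,
					 uint64_t offset, uint8_t gen,
					 uint64_t bucket_size)
	{
		const struct entry *floor = NULL;
		size_t i;

		/* floor search: greatest entry with e[i].offset <= offset */
		for (i = 0; i < nr && e[i].offset <= offset; i++)
			floor = &e[i];

		return floor &&
		       offset < floor->offset + bucket_size &&
		       gen == floor->gen;
	}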
 
-       /* don't check this if we bailed out early: */
-       for (i = h->data; i < h->data + h->used; i++) {
-               struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+                                struct bkey_s_c k,
+                                struct bch_io_opts *io_opts,
+                                struct data_opts *data_opts)
+{
+       struct bch_dev *ca = arg;
 
-               if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
-                       sectors_not_moved += bucket_sectors_used(m);
-                       buckets_not_moved++;
-               }
-       }
+       if (!__copygc_pred(ca, k))
+               return DATA_SKIP;
 
-       if (sectors_not_moved)
-               bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
-                        sectors_not_moved, sectors_to_move,
-                        buckets_not_moved, buckets_to_move);
-       return;
-out:
-       bch2_btree_iter_unlock(&iter);
-       bch2_move_ctxt_exit(&ctxt);
-       trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
-                                  buckets_to_move);
+       data_opts->target               = dev_to_target(ca->dev_idx);
+       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
+       data_opts->rewrite_dev          = ca->dev_idx;
+       return DATA_REWRITE;
 }
 
 static bool have_copygc_reserve(struct bch_dev *ca)
@@ -161,45 +114,26 @@ static bool have_copygc_reserve(struct bch_dev *ca)
        bool ret;
 
        spin_lock(&ca->freelist_lock);
-       ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
-               COPYGC_BUCKETS_PER_ITER(ca);
+       ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
+               ca->allocator_blocked;
        spin_unlock(&ca->freelist_lock);
 
        return ret;
 }
 
-static inline int sectors_used_cmp(bucket_heap *heap,
-                                  struct bucket_heap_entry l,
-                                  struct bucket_heap_entry r)
+static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
 {
-       return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
-}
-
-static void bch2_moving_gc(struct bch_dev *ca)
-{
-       struct bch_fs *c = ca->fs;
-       struct bucket *g;
-       u64 sectors_to_move = 0;
-       size_t buckets_to_move, buckets_unused = 0;
-       struct bucket_heap_entry e, *i;
-       int reserve_sectors;
-
-       if (!have_copygc_reserve(ca)) {
-               struct closure cl;
-
-               closure_init_stack(&cl);
-               while (1) {
-                       closure_wait(&c->freelist_wait, &cl);
-                       if (have_copygc_reserve(ca))
-                               break;
-                       closure_sync(&cl);
-               }
-               closure_wake_up(&c->freelist_wait);
-       }
-
-       reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);
+       copygc_heap *h = &ca->copygc_heap;
+       struct copygc_heap_entry e, *i;
+       struct bucket_array *buckets;
+       struct bch_move_stats move_stats;
+       u64 sectors_to_move = 0, sectors_not_moved = 0;
+       u64 buckets_to_move, buckets_not_moved = 0;
+       size_t b;
+       int ret;
 
-       trace_moving_gc_start(ca);
+       memset(&move_stats, 0, sizeof(move_stats));
+       closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
 
        /*
         * Find buckets with lowest sector counts, skipping completely
@@ -207,60 +141,90 @@ static void bch2_moving_gc(struct bch_dev *ca)
         * and repeatedly replacing the maximum element until all
         * buckets have been visited.
         */
+       h->used = 0;
 
        /*
         * We need bucket marks to be up to date - gc can't be recalculating
         * them:
         */
        down_read(&c->gc_lock);
-       ca->copygc_heap.used = 0;
-       for_each_bucket(g, ca) {
-               struct bucket_mark m = READ_ONCE(g->mark);
-               struct bucket_heap_entry e = { g - ca->buckets, m };
+       down_read(&ca->bucket_lock);
+       buckets = bucket_array(ca);
 
-               if (bucket_unused(m)) {
-                       buckets_unused++;
-                       continue;
-               }
+       for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
+               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+               struct copygc_heap_entry e;
 
                if (m.owned_by_allocator ||
-                   m.data_type != BUCKET_DATA)
-                       continue;
-
-               if (bucket_sectors_used(m) >= ca->mi.bucket_size)
+                   m.data_type != BCH_DATA_USER ||
+                   !bucket_sectors_used(m) ||
+                   bucket_sectors_used(m) >= ca->mi.bucket_size)
                        continue;
 
-               heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
+               e = (struct copygc_heap_entry) {
+                       .gen            = m.gen,
+                       .sectors        = bucket_sectors_used(m),
+                       .offset         = bucket_to_sector(ca, b),
+               };
+               heap_add_or_replace(h, e, -sectors_used_cmp, NULL);
        }
+       up_read(&ca->bucket_lock);
        up_read(&c->gc_lock);
 
-       for (i = ca->copygc_heap.data;
-            i < ca->copygc_heap.data + ca->copygc_heap.used;
-            i++)
-               sectors_to_move += bucket_sectors_used(i->mark);
+       for (i = h->data; i < h->data + h->used; i++)
+               sectors_to_move += i->sectors;
 
        while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
-               BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
-               sectors_to_move -= bucket_sectors_used(e.mark);
+               BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL));
+               sectors_to_move -= e.sectors;
        }
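The loop above keeps the emptiest buckets and evicts the fullest: the heap is
ordered so heap_pop() removes the candidate with the most live sectors,
repeating until the remaining total fits the per-iteration budget. The same
selection, sketched with a plain sort (struct cand and trim_to_budget() are
hypothetical, shown only to make the policy explicit):

	#include <stdint.h>
	#include <stdlib.h>

	struct cand { uint64_t sectors; };

	static int cand_cmp(const void *_l, const void *_r)
	{
		const struct cand *l = _l, *r = _r;

		return (l->sectors > r->sectors) - (l->sectors < r->sectors);
	}

	/* keep the least-used candidates that fit in 'budget' sectors */
	static size_t trim_to_budget(struct cand *c, size_t nr, uint64_t budget)
	{
		uint64_t total = 0;
		size_t i;

		qsort(c, nr, sizeof(*c), cand_cmp);	/* emptiest first */

		for (i = 0; i < nr; i++)
			total += c[i].sectors;

		while (nr && total > budget)	/* drop the fullest, from the end */
			total -= c[--nr].sectors;

		return nr;
	}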
 
-       buckets_to_move = ca->copygc_heap.used;
+       buckets_to_move = h->used;
+
+       if (!buckets_to_move)
+               return;
+
+       eytzinger0_sort(h->data, h->used,
+                       sizeof(h->data[0]),
+                       bucket_offset_cmp, NULL);
+
+       ret = bch2_move_data(c, &ca->copygc_pd.rate,
+                            writepoint_ptr(&ca->copygc_write_point),
+                            POS_MIN, POS_MAX,
+                            copygc_pred, ca,
+                            &move_stats);
 
-       eytzinger0_sort(ca->copygc_heap.data,
-                       ca->copygc_heap.used,
-                       sizeof(ca->copygc_heap.data[0]),
-                       bucket_idx_cmp, NULL);
+       down_read(&ca->bucket_lock);
+       buckets = bucket_array(ca);
+       for (i = h->data; i < h->data + h->used; i++) {
+               size_t b = sector_to_bucket(ca, i->offset);
+               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+
+               if (i->gen == m.gen && bucket_sectors_used(m)) {
+                       sectors_not_moved += bucket_sectors_used(m);
+                       buckets_not_moved++;
+               }
+       }
+       up_read(&ca->bucket_lock);
+
+       if (sectors_not_moved && !ret)
+               bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
+                        sectors_not_moved, sectors_to_move,
+                        buckets_not_moved, buckets_to_move);
 
-       read_moving(ca, buckets_to_move, sectors_to_move);
+       trace_copygc(ca,
+                    atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
+                    buckets_to_move, buckets_not_moved);
 }
 
-static int bch2_moving_gc_thread(void *arg)
+static int bch2_copygc_thread(void *arg)
 {
        struct bch_dev *ca = arg;
        struct bch_fs *c = ca->fs;
        struct io_clock *clock = &c->io_clock[WRITE];
+       struct bch_dev_usage usage;
        unsigned long last;
-       u64 available, want, next;
+       u64 available, fragmented, reserve, next;
 
        set_freezable();
 
@@ -269,60 +233,77 @@ static int bch2_moving_gc_thread(void *arg)
                        break;
 
                last = atomic_long_read(&clock->now);
+
+               reserve = ca->copygc_threshold;
+
+               usage = bch2_dev_usage_read(c, ca);
+
+               available = __dev_buckets_available(ca, usage) *
+                       ca->mi.bucket_size;
+               if (available > reserve) {
+                       next = last + available - reserve;
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
+                       continue;
+               }
+
                /*
-                * don't start copygc until less than half the gc reserve is
-                * available:
+                * don't start copygc until there's more than half the copygc
+                * reserve of fragmented space:
                 */
-               available = dev_buckets_available(ca);
-               want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
-                                c->opts.gc_reserve_percent, 200);
-               if (available > want) {
-                       next = last + (available - want) *
-                               ca->mi.bucket_size;
-                       bch2_kthread_io_clock_wait(clock, next);
+               fragmented = usage.sectors_fragmented;
+               if (fragmented < reserve) {
+                       next = last + reserve - fragmented;
+                       bch2_kthread_io_clock_wait(clock, next,
+                                       MAX_SCHEDULE_TIMEOUT);
                        continue;
                }
 
-               bch2_moving_gc(ca);
+               bch2_copygc(c, ca);
        }
 
        return 0;
 }
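The waits above are on the device's write I/O clock, which advances by sectors
written rather than by wall time, so "next = last + available - reserve" means
"wake once enough writes have landed to eat the slack". The decision logic,
restated as a plain function (copygc_decide() and its inputs are stand-ins for
bch2_dev_usage_read() and the clock, not bcachefs API):

	#include <stdbool.h>
	#include <stdint.h>

	struct copygc_decision {
		bool		run_now;
		uint64_t	wake_after_sectors;	/* valid if !run_now */
	};

	static struct copygc_decision copygc_decide(uint64_t available,
						    uint64_t fragmented,
						    uint64_t reserve)
	{
		/* plenty of free space: sleep until writes eat the slack */
		if (available > reserve)
			return (struct copygc_decision) {
				.wake_after_sectors = available - reserve,
			};

		/* not enough fragmented space to be worth copying yet */
		if (fragmented < reserve)
			return (struct copygc_decision) {
				.wake_after_sectors = reserve - fragmented,
			};

		return (struct copygc_decision) { .run_now = true };
	}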
 
-void bch2_moving_gc_stop(struct bch_dev *ca)
+void bch2_copygc_stop(struct bch_dev *ca)
 {
-       ca->moving_gc_pd.rate.rate = UINT_MAX;
-       bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
+       ca->copygc_pd.rate.rate = UINT_MAX;
+       bch2_ratelimit_reset(&ca->copygc_pd.rate);
 
-       if (ca->moving_gc_read)
-               kthread_stop(ca->moving_gc_read);
-       ca->moving_gc_read = NULL;
+       if (ca->copygc_thread) {
+               kthread_stop(ca->copygc_thread);
+               put_task_struct(ca->copygc_thread);
+       }
+       ca->copygc_thread = NULL;
 }
 
-int bch2_moving_gc_start(struct bch_dev *ca)
+int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
 {
        struct task_struct *t;
 
-       BUG_ON(ca->moving_gc_read);
+       BUG_ON(ca->copygc_thread);
 
-       if (ca->fs->opts.nochanges)
+       if (c->opts.nochanges)
                return 0;
 
-       if (bch2_fs_init_fault("moving_gc_start"))
+       if (bch2_fs_init_fault("copygc_start"))
                return -ENOMEM;
 
-       t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
+       t = kthread_create(bch2_copygc_thread, ca,
+                          "bch_copygc[%s]", ca->name);
        if (IS_ERR(t))
                return PTR_ERR(t);
 
-       ca->moving_gc_read = t;
-       wake_up_process(ca->moving_gc_read);
+       get_task_struct(t);
+
+       ca->copygc_thread = t;
+       wake_up_process(ca->copygc_thread);
 
        return 0;
 }
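Note the new get_task_struct()/put_task_struct() pair around the thread's
lifetime: without holding a reference, the task struct could be freed if the
thread exited before kthread_stop() ran, turning the stop into a
use-after-free. The pattern in isolation (struct my_dev and my_thread_fn()
are hypothetical; the kthread_* and task-ref calls are real kernel API):

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/sched/task.h>	/* get_task_struct(), put_task_struct() */

	struct my_dev {
		struct task_struct	*thread;
	};

	static int my_thread_fn(void *arg)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static int my_thread_start(struct my_dev *d)
	{
		struct task_struct *t = kthread_create(my_thread_fn, d, "my_thread");

		if (IS_ERR(t))
			return PTR_ERR(t);

		get_task_struct(t);	/* pin: keeps *t valid past thread exit */
		d->thread = t;
		wake_up_process(t);
		return 0;
	}

	static void my_thread_stop(struct my_dev *d)
	{
		if (d->thread) {
			kthread_stop(d->thread);
			put_task_struct(d->thread);
		}
		d->thread = NULL;
	}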
 
-void bch2_dev_moving_gc_init(struct bch_dev *ca)
+void bch2_dev_copygc_init(struct bch_dev *ca)
 {
-       bch2_pd_controller_init(&ca->moving_gc_pd);
-       ca->moving_gc_pd.d_term = 0;
+       bch2_pd_controller_init(&ca->copygc_pd);
+       ca->copygc_pd.d_term = 0;
 }