/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
#include "eytzinger.h"
#include "move.h"
#include "movinggc.h"

#include <trace/events/bcachefs.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/wait.h>
/*
 * We can't use the entire copygc reserve in one iteration of copygc: we may
 * need the buckets we're freeing up to go back into the copygc reserve to make
 * forward progress, but if the copygc reserve is full they'll be available for
 * any allocation - and it's possible that in a given iteration, we free up most
 * of the buckets we're going to free before we allocate most of the buckets
 * we're going to allocate.
 *
 * If we only use half of the reserve per iteration, then in steady state we'll
 * always have room in the reserve for the buckets we're going to need in the
 * next iteration:
 */
#define COPYGC_BUCKETS_PER_ITER(ca)					\
	((ca)->free[RESERVE_MOVINGGC].size / 2)
/*
 * Max sectors to move per iteration: Have to take into account internal
 * fragmentation from the multiple write points for each generation:
 */
#define COPYGC_SECTORS_PER_ITER(ca)					\
	((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
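
/*
 * Heap comparison function: orders copygc_heap entries by the number of
 * sectors still in use in the bucket. It's passed negated below, so the
 * heap hangs on to the emptiest buckets - the cheapest ones to evacuate.
 */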
static inline int sectors_used_cmp(copygc_heap *heap,
				   struct copygc_heap_entry l,
				   struct copygc_heap_entry r)
{
	return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}
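
/*
 * Comparison function for sorting/searching heap entries by bucket offset
 * (in sectors), used with the eytzinger helpers below.
 */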
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
	const struct copygc_heap_entry *l = _l;
	const struct copygc_heap_entry *r = _r;

	return (l->offset > r->offset) - (l->offset < r->offset);
}
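
/*
 * Returns true if the extent has a pointer on this device that falls inside
 * one of the buckets currently selected for copygc: the pointer's offset is
 * looked up in the heap (sorted by bucket offset) and must land within the
 * bucket and still carry a matching generation number.
 */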
static bool __copygc_pred(struct bch_dev *ca,
			  struct bkey_s_c_extent e)
{
	copygc_heap *h = &ca->copygc_heap;
	const struct bch_extent_ptr *ptr =
		bch2_extent_has_device(e, ca->dev_idx);

	if (ptr) {
		struct copygc_heap_entry search = { .offset = ptr->offset };
		ssize_t i = eytzinger0_find_le(h->data, h->used,
					       sizeof(h->data[0]),
					       bucket_offset_cmp, &search);

		return (i >= 0 &&
			ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
			ptr->gen == h->data[i].mark.gen);
	}

	return false;
}
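
/*
 * Predicate passed to bch2_move_data(): extents living in one of the selected
 * buckets get rewritten back onto the same device; BTREE_INSERT_USE_RESERVE
 * lets the resulting index update dip into reserves, since copygc must make
 * progress even when the filesystem is low on space.
 */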
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
				 struct bkey_s_c_extent e,
				 struct bch_io_opts *io_opts,
				 struct data_opts *data_opts)
{
	struct bch_dev *ca = arg;

	if (!__copygc_pred(ca, e))
		return DATA_SKIP;

	data_opts->target		= dev_to_target(ca->dev_idx);
	data_opts->btree_insert_flags	= BTREE_INSERT_USE_RESERVE;
	data_opts->rewrite_dev		= ca->dev_idx;
	return DATA_REWRITE;
}
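
/*
 * Copygc needs buckets to allocate from before it can make progress: wait
 * until the movinggc reserve is full, or the allocator is blocked and can't
 * fill it any further.
 */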
static bool have_copygc_reserve(struct bch_dev *ca)
{
	bool ret;

	spin_lock(&ca->freelist_lock);
	ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
		ca->allocator_blocked;
	spin_unlock(&ca->freelist_lock);

	return ret;
}
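
/*
 * One copygc pass for a device: pick the fragmented buckets with the least
 * live data, move that data elsewhere on the same device, then check how much
 * of it actually got evacuated.
 */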
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
{
	copygc_heap *h = &ca->copygc_heap;
	struct copygc_heap_entry e, *i;
	struct bucket_array *buckets;
	struct bch_move_stats move_stats;
	u64 sectors_to_move = 0, sectors_not_moved = 0;
	u64 buckets_to_move, buckets_not_moved = 0;
	size_t b;
	int ret;

	memset(&move_stats, 0, sizeof(move_stats));
	closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
	/*
	 * Find buckets with lowest sector counts, skipping completely
	 * empty buckets, by building a maxheap sorted by sector count,
	 * and repeatedly replacing the maximum element until all
	 * buckets have been visited.
	 */
	h->used = 0;

	/*
	 * We need bucket marks to be up to date - gc can't be recalculating
	 * them:
	 */
	down_read(&c->gc_lock);
	down_read(&ca->bucket_lock);
	buckets = bucket_array(ca);
	for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
		struct copygc_heap_entry e;

		if (m.owned_by_allocator ||
		    m.data_type != BCH_DATA_USER ||
		    !bucket_sectors_used(m) ||
		    bucket_sectors_used(m) >= ca->mi.bucket_size)
			continue;

		e = (struct copygc_heap_entry) {
			.offset	= bucket_to_sector(ca, b),
			.mark	= m
		};
		heap_add_or_replace(h, e, -sectors_used_cmp);
	}
	up_read(&ca->bucket_lock);
	up_read(&c->gc_lock);
	for (i = h->data; i < h->data + h->used; i++)
		sectors_to_move += bucket_sectors_used(i->mark);
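
	/*
	 * If we selected more than we can move in one iteration, drop the
	 * fullest buckets from the heap until what's left fits under the
	 * per-iteration limit:
	 */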
	while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
		BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
		sectors_to_move -= bucket_sectors_used(e.mark);
	}
	buckets_to_move = h->used;

	if (!buckets_to_move)
		return;
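
	/*
	 * Re-sort the selected buckets by offset so __copygc_pred() can
	 * binary search them by pointer offset:
	 */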
	eytzinger0_sort(h->data, h->used,
			sizeof(h->data[0]),
			bucket_offset_cmp, NULL);
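
	/*
	 * Move the live data out of the selected buckets: bch2_move_data()
	 * walks the extents and copygc_pred() picks the ones to rewrite,
	 * rate-limited by the copygc pd controller:
	 */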
	ret = bch2_move_data(c, &ca->copygc_pd.rate,
			     writepoint_ptr(&ca->copygc_write_point),
			     POS_MIN, POS_MAX,
			     copygc_pred, ca,
			     &move_stats);
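
	/*
	 * Check how much of the data we meant to move is still live: if a
	 * bucket's generation is unchanged and it still has used sectors,
	 * part of it wasn't successfully moved.
	 */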
	down_read(&ca->bucket_lock);
	buckets = bucket_array(ca);
	for (i = h->data; i < h->data + h->used; i++) {
		size_t b = sector_to_bucket(ca, i->offset);
		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);

		if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
			sectors_not_moved += bucket_sectors_used(m);
			buckets_not_moved++;
		}
	}
	up_read(&ca->bucket_lock);
	if (sectors_not_moved && !ret)
		bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
			 sectors_not_moved, sectors_to_move,
			 buckets_not_moved, buckets_to_move);

	trace_copygc(ca,
		     atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
		     buckets_to_move, buckets_not_moved);
}
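
/*
 * Per-device copygc thread: paced by the write I/O clock, it sleeps until
 * free space runs low and enough fragmented space has accumulated to be worth
 * reclaiming, then runs a copygc pass.
 */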
static int bch2_copygc_thread(void *arg)
{
	struct bch_dev *ca = arg;
	struct bch_fs *c = ca->fs;
	struct io_clock *clock = &c->io_clock[WRITE];
	struct bch_dev_usage usage;
	unsigned long last;
	u64 available, fragmented, reserve, next;

	set_freezable();

	while (!kthread_should_stop()) {
		if (kthread_wait_freezable(c->copy_gc_enabled))
			break;

		last = atomic_long_read(&clock->now);
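
		/*
		 * The target reserve is half of gc_reserve_percent of the
		 * device's usable space, in sectors - hence dividing by 200
		 * rather than 100:
		 */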
		reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
				    ca->mi.bucket_size *
				    c->opts.gc_reserve_percent, 200);

		usage = bch2_dev_usage_read(c, ca);
		/*
		 * don't start copygc until less than half the gc reserve is
		 * available:
		 */
		available = __dev_buckets_available(ca, usage) *
			ca->mi.bucket_size;
		if (available > reserve) {
			next = last + available - reserve;
			bch2_kthread_io_clock_wait(clock, next);
			continue;
		}
		/*
		 * don't start copygc until there's more than half the copygc
		 * reserve of fragmented space:
		 */
		fragmented = usage.sectors_fragmented;
		if (fragmented < reserve) {
			next = last + reserve - fragmented;
			bch2_kthread_io_clock_wait(clock, next);
			continue;
		}

		bch2_copygc(c, ca);
	}

	return 0;
}
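
/*
 * Tear down the copygc thread: bump the rate limit to UINT_MAX so any
 * throttled sleep finishes immediately, then stop and drop the thread.
 */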
void bch2_copygc_stop(struct bch_dev *ca)
{
	ca->copygc_pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&ca->copygc_pd.rate);

	if (ca->copygc_thread) {
		kthread_stop(ca->copygc_thread);
		put_task_struct(ca->copygc_thread);
	}
	ca->copygc_thread = NULL;
}
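
/*
 * Create and start the per-device copygc thread; nothing is started in
 * nochanges mode.
 */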
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
{
	struct task_struct *t;

	BUG_ON(ca->copygc_thread);

	if (c->opts.nochanges)
		return 0;

	if (bch2_fs_init_fault("copygc_start"))
		return -ENOMEM;

	t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
	if (IS_ERR(t))
		return PTR_ERR(t);

	get_task_struct(t);

	ca->copygc_thread = t;
	wake_up_process(ca->copygc_thread);

	return 0;
}
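
/*
 * Set up the pd controller that governs the copygc move rate; the derivative
 * term is disabled here.
 */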
void bch2_dev_copygc_init(struct bch_dev *ca)
{
	bch2_pd_controller_init(&ca->copygc_pd);
	ca->copygc_pd.d_term = 0;
}