git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/movinggc.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Moving/copying garbage collector
   4  *
   5  * Copyright 2012 Google, Inc.
   6  */
   7
   8 #include "bcachefs.h"
   9 #include "alloc_foreground.h"
  10 #include "btree_iter.h"
  11 #include "btree_update.h"
  12 #include "buckets.h"
  13 #include "clock.h"
  14 #include "disk_groups.h"
  15 #include "error.h"
  16 #include "extents.h"
  17 #include "eytzinger.h"
  18 #include "io.h"
  19 #include "keylist.h"
  20 #include "move.h"
  21 #include "movinggc.h"
  22 #include "super-io.h"
  23
  24 #include <trace/events/bcachefs.h>
  25 #include <linux/freezer.h>
  26 #include <linux/kthread.h>
  27 #include <linux/math64.h>
  28 #include <linux/sched/task.h>
  29 #include <linux/sort.h>
  30 #include <linux/wait.h>
  31
  32 /*
  33  * We can't use the entire copygc reserve in one iteration of copygc: we may
  34  * need the buckets we're freeing up to go back into the copygc reserve to make
  35  * forward progress, but if the copygc reserve is full they'll be available for
  36  * any allocation - and it's possible that in a given iteration, we free up most
  37  * of the buckets we're going to free before we allocate most of the buckets
  38  * we're going to allocate.
  39  *
  40  * If we only use half of the reserve per iteration, then in steady state we'll
  41  * always have room in the reserve for the buckets we're going to need in the
  42  * next iteration:
  43  */
  44 #define COPYGC_BUCKETS_PER_ITER(ca)                                     \
  45         ((ca)->free[RESERVE_MOVINGGC].size / 2)
  46
  47 static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
  48 {
  49         const struct copygc_heap_entry *l = _l;
  50         const struct copygc_heap_entry *r = _r;
  51
  52         return  cmp_int(l->dev,    r->dev) ?:
  53                 cmp_int(l->offset, r->offset);
  54 }
  55
  56 static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k)
  57 {
  58         copygc_heap *h = &c->copygc_heap;
  59         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
  60         const struct bch_extent_ptr *ptr;
  61
  62         bkey_for_each_ptr(ptrs, ptr) {
  63                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
  64                 struct copygc_heap_entry search = {
  65                         .dev = ptr->dev,
  66                         .offset = ptr->offset
  67                 };
  68
  69                 ssize_t i = eytzinger0_find_le(h->data, h->used,
  70                                                sizeof(h->data[0]),
  71                                                bucket_offset_cmp, &search);
  72 #if 0
  73                 /* eytzinger search verify code: */
  74                 ssize_t j = -1, k;
  75
  76                 for (k = 0; k < h->used; k++)
  77                         if (h->data[k].offset <= ptr->offset &&
  78                             (j < 0 || h->data[k].offset > h->data[j].offset))
  79                                 j = k;
  80
  81                 BUG_ON(i != j);
  82 #endif
  83                 if (i >= 0 &&
  84                     ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
  85                     ptr->gen == h->data[i].gen)
  86                         return ptr->dev;
  87         }
  88
  89         return -1;
  90 }
  91
  92 static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
  93                                  struct bkey_s_c k,
  94                                  struct bch_io_opts *io_opts,
  95                                  struct data_opts *data_opts)
  96 {
  97         int dev_idx = __copygc_pred(c, k);
  98         if (dev_idx < 0)
  99                 return DATA_SKIP;
 100
 101         data_opts->target               = io_opts->background_target;
 102         data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
 103         data_opts->rewrite_dev          = dev_idx;
 104         return DATA_REWRITE;
 105 }
 106
 107 static bool have_copygc_reserve(struct bch_dev *ca)
 108 {
 109         bool ret;
 110
 111         spin_lock(&ca->fs->freelist_lock);
 112         ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
 113                 ca->allocator_state != ALLOCATOR_RUNNING;
 114         spin_unlock(&ca->fs->freelist_lock);
 115
 116         return ret;
 117 }
 118
 119 static inline int fragmentation_cmp(copygc_heap *heap,
 120                                    struct copygc_heap_entry l,
 121                                    struct copygc_heap_entry r)
 122 {
 123         return cmp_int(l.fragmentation, r.fragmentation);
 124 }
 125
 126 static int bch2_copygc(struct bch_fs *c)
 127 {
 128         copygc_heap *h = &c->copygc_heap;
 129         struct copygc_heap_entry e, *i;
 130         struct bucket_array *buckets;
 131         struct bch_move_stats move_stats;
 132         u64 sectors_to_move = 0, sectors_not_moved = 0;
 133         u64 sectors_reserved = 0;
 134         u64 buckets_to_move, buckets_not_moved = 0;
 135         struct bch_dev *ca;
 136         unsigned dev_idx;
 137         size_t b, heap_size = 0;
 138         int ret;
 139
 140         memset(&move_stats, 0, sizeof(move_stats));
 141         /*
 142          * Find buckets with lowest sector counts, skipping completely
 143          * empty buckets, by building a maxheap sorted by sector count,
 144          * and repeatedly replacing the maximum element until all
 145          * buckets have been visited.
 146          */
 147         h->used = 0;
 148
 149         for_each_rw_member(ca, c, dev_idx)
 150                 heap_size += ca->mi.nbuckets >> 7;
 151
 152         if (h->size < heap_size) {
 153                 free_heap(&c->copygc_heap);
 154                 if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
 155                         bch_err(c, "error allocating copygc heap");
 156                         return 0;
 157                 }
 158         }
 159
 160         for_each_rw_member(ca, c, dev_idx) {
 161                 closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
 162
 163                 spin_lock(&ca->fs->freelist_lock);
 164                 sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
 165                 spin_unlock(&ca->fs->freelist_lock);
 166
 167                 down_read(&ca->bucket_lock);
 168                 buckets = bucket_array(ca);
 169
 170                 for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
 171                         struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
 172                         struct copygc_heap_entry e;
 173
 174                         if (m.owned_by_allocator ||
 175                             m.data_type != BCH_DATA_user ||
 176                             !bucket_sectors_used(m) ||
 177                             bucket_sectors_used(m) >= ca->mi.bucket_size)
 178                                 continue;
 179
 180                         e = (struct copygc_heap_entry) {
 181                                 .dev            = dev_idx,
 182                                 .gen            = m.gen,
 183                                 .fragmentation  = bucket_sectors_used(m) * (1U << 15)
 184                                         / ca->mi.bucket_size,
 185                                 .sectors        = bucket_sectors_used(m),
 186                                 .offset         = bucket_to_sector(ca, b),
 187                         };
 188                         heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
 189                 }
 190                 up_read(&ca->bucket_lock);
 191         }
 192
 193         if (!sectors_reserved) {
 194                 bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
 195                 return -1;
 196         }
 197
 198         for (i = h->data; i < h->data + h->used; i++)
 199                 sectors_to_move += i->sectors;
 200
 201         while (sectors_to_move > sectors_reserved) {
 202                 BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
 203                 sectors_to_move -= e.sectors;
 204         }
 205
 206         buckets_to_move = h->used;
 207
 208         if (!buckets_to_move)
 209                 return 0;
 210
 211         eytzinger0_sort(h->data, h->used,
 212                         sizeof(h->data[0]),
 213                         bucket_offset_cmp, NULL);
 214
 215         ret = bch2_move_data(c, &c->copygc_pd.rate,
 216                              writepoint_ptr(&c->copygc_write_point),
 217                              POS_MIN, POS_MAX,
 218                              copygc_pred, NULL,
 219                              &move_stats);
 220
 221         for_each_rw_member(ca, c, dev_idx) {
 222                 down_read(&ca->bucket_lock);
 223                 buckets = bucket_array(ca);
 224                 for (i = h->data; i < h->data + h->used; i++) {
 225                         struct bucket_mark m;
 226                         size_t b;
 227
 228                         if (i->dev != dev_idx)
 229                                 continue;
 230
 231                         b = sector_to_bucket(ca, i->offset);
 232                         m = READ_ONCE(buckets->b[b].mark);
 233
 234                         if (i->gen == m.gen &&
 235                             bucket_sectors_used(m)) {
 236                                 sectors_not_moved += bucket_sectors_used(m);
 237                                 buckets_not_moved++;
 238                         }
 239                 }
 240                 up_read(&ca->bucket_lock);
 241         }
 242
 243         if (sectors_not_moved && !ret)
 244                 bch_warn_ratelimited(c,
 245                         "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
 246                          sectors_not_moved, sectors_to_move,
 247                          buckets_not_moved, buckets_to_move,
 248                          atomic64_read(&move_stats.sectors_moved),
 249                          atomic64_read(&move_stats.keys_raced),
 250                          atomic64_read(&move_stats.sectors_raced));
 251
 252         trace_copygc(c,
 253                      atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
 254                      buckets_to_move, buckets_not_moved);
 255         return 0;
 256 }
 257
 258 /*
 259  * Copygc runs when the amount of fragmented data is above some arbitrary
 260  * threshold:
 261  *
 262  * The threshold at the limit - when the device is full - is the amount of space
 263  * we reserved in bch2_recalc_capacity; we can't have more than that amount of
 264  * disk space stranded due to fragmentation and store everything we have
 265  * promised to store.
 266  *
 267  * But we don't want to be running copygc unnecessarily when the device still
 268  * has plenty of free space - rather, we want copygc to smoothly run every so
 269  * often and continually reduce the amount of fragmented space as the device
 270  * fills up. So, we increase the threshold by half the current free space.
 271  */
 272 unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
 273 {
 274         struct bch_dev *ca;
 275         unsigned dev_idx;
 276         u64 fragmented_allowed = c->copygc_threshold;
 277         u64 fragmented = 0;
 278
 279         for_each_rw_member(ca, c, dev_idx) {
 280                 struct bch_dev_usage usage = bch2_dev_usage_read(ca);
 281
 282                 fragmented_allowed += ((__dev_buckets_available(ca, usage) *
 283                                         ca->mi.bucket_size) >> 1);
 284                 fragmented += usage.sectors_fragmented;
 285         }
 286
 287         return max_t(s64, 0, fragmented_allowed - fragmented);
 288 }
 289
 290 static int bch2_copygc_thread(void *arg)
 291 {
 292         struct bch_fs *c = arg;
 293         struct io_clock *clock = &c->io_clock[WRITE];
 294         unsigned long last, wait;
 295
 296         set_freezable();
 297
 298         while (!kthread_should_stop()) {
 299                 if (kthread_wait_freezable(c->copy_gc_enabled))
 300                         break;
 301
 302                 last = atomic_long_read(&clock->now);
 303                 wait = bch2_copygc_wait_amount(c);
 304
 305                 if (wait > clock->max_slop) {
 306                         bch2_kthread_io_clock_wait(clock, last + wait,
 307                                         MAX_SCHEDULE_TIMEOUT);
 308                         continue;
 309                 }
 310
 311                 if (bch2_copygc(c))
 312                         break;
 313         }
 314
 315         return 0;
 316 }
 317
 318 void bch2_copygc_stop(struct bch_fs *c)
 319 {
 320         c->copygc_pd.rate.rate = UINT_MAX;
 321         bch2_ratelimit_reset(&c->copygc_pd.rate);
 322
 323         if (c->copygc_thread) {
 324                 kthread_stop(c->copygc_thread);
 325                 put_task_struct(c->copygc_thread);
 326         }
 327         c->copygc_thread = NULL;
 328 }
 329
 330 int bch2_copygc_start(struct bch_fs *c)
 331 {
 332         struct task_struct *t;
 333
 334         if (c->copygc_thread)
 335                 return 0;
 336
 337         if (c->opts.nochanges)
 338                 return 0;
 339
 340         if (bch2_fs_init_fault("copygc_start"))
 341                 return -ENOMEM;
 342
 343         t = kthread_create(bch2_copygc_thread, c, "bch_copygc");
 344         if (IS_ERR(t))
 345                 return PTR_ERR(t);
 346
 347         get_task_struct(t);
 348
 349         c->copygc_thread = t;
 350         wake_up_process(c->copygc_thread);
 351
 352         return 0;
 353 }
 354
 355 void bch2_fs_copygc_init(struct bch_fs *c)
 356 {
 357         bch2_pd_controller_init(&c->copygc_pd);
 358         c->copygc_pd.d_term = 0;
 359 }