/* libbcachefs/movinggc.c */
/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
#include "super-io.h"

#include <trace/events/bcachefs.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sort.h>
#include <linux/wait.h>

/* Moving GC - IO loop */

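/*
 * Comparison by bucket index, used both to sort the copygc heap entries
 * (eytzinger0_sort() in bch2_moving_gc()) and to binary search them
 * (eytzinger0_find() in moving_pred()):
 */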
static int bucket_idx_cmp(const void *_l, const void *_r, size_t size)
{
	const struct bucket_heap_entry *l = _l;
	const struct bucket_heap_entry *r = _r;

	if (l->bucket < r->bucket)
		return -1;
	if (l->bucket > r->bucket)
		return 1;
	return 0;
}

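/*
 * Returns the pointer to @ca if @k is an extent with data in one of the
 * buckets currently selected for copygc (i.e. present in the sorted copygc
 * heap), or NULL if the extent doesn't need to be moved:
 */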
static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca,
						struct bkey_s_c k)
{
	bucket_heap *h = &ca->copygc_heap;
	const struct bch_extent_ptr *ptr;

	if (bkey_extent_is_data(k.k) &&
	    (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k),
					  ca->dev_idx))) {
		struct bucket_heap_entry search = {
			.bucket = PTR_BUCKET_NR(ca, ptr)
		};

		size_t i = eytzinger0_find(h->data, h->used,
					   sizeof(h->data[0]),
					   bucket_idx_cmp, &search);

		if (i < h->used)
			return ptr;
	}

	return NULL;
}

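/*
 * Queue a move of @k's data off of @ca; returns nonzero if the move couldn't
 * be started (allocation failure), zero otherwise - including when we raced
 * and the bucket had already been reused:
 */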
static int issue_moving_gc_move(struct bch_dev *ca,
				struct moving_context *ctxt,
				struct bkey_s_c k)
{
	struct bch_fs *c = ca->fs;
	const struct bch_extent_ptr *ptr;
	int ret;

	ptr = moving_pred(ca, k);
	if (!ptr) /* We raced - bucket's been reused */
		return 0;

	ret = bch2_data_move(c, ctxt, &ca->self, k, ptr);
	if (!ret)
		trace_gc_copy(k.k);
	else
		trace_moving_gc_alloc_fail(c, k.k->size);
	return ret;
}

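/*
 * Walk the extents btree and issue moves for every extent that lives in one
 * of the buckets picked by bch2_moving_gc(), then report any live data we
 * failed to move:
 */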
static void read_moving(struct bch_dev *ca, size_t buckets_to_move,
			u64 sectors_to_move)
{
	struct bch_fs *c = ca->fs;
	bucket_heap *h = &ca->copygc_heap;
	struct moving_context ctxt;
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 sectors_not_moved = 0;
	size_t buckets_not_moved = 0;
	struct bucket_heap_entry *i;

	bch2_ratelimit_reset(&ca->moving_gc_pd.rate);
	bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
			    SECTORS_IN_FLIGHT_PER_DEVICE);
	bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
			     BTREE_ITER_PREFETCH);

	while (1) {
		if (kthread_should_stop())
			goto out;
		if (bch2_move_ctxt_wait(&ctxt))
			goto out;
		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;
		if (btree_iter_err(k))
			goto out;

		if (!moving_pred(ca, k))
			goto next;

		if (issue_moving_gc_move(ca, &ctxt, k)) {
			bch2_btree_iter_unlock(&iter);

			/* memory allocation failure, wait for some IO to finish */
			bch2_move_ctxt_wait_for_io(&ctxt);
			continue;
		}
next:
		bch2_btree_iter_advance_pos(&iter);
		//bch2_btree_iter_cond_resched(&iter);

		/* unlock before looping back to bch2_move_ctxt_wait() */
		bch2_btree_iter_unlock(&iter);
		cond_resched();
	}

	bch2_btree_iter_unlock(&iter);
	bch2_move_ctxt_exit(&ctxt);
	trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
			    buckets_to_move);

	/* don't check this if we bailed out early: */
	for (i = h->data; i < h->data + h->used; i++) {
		struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark);

		if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
			sectors_not_moved += bucket_sectors_used(m);
			buckets_not_moved++;
		}
	}

	if (sectors_not_moved)
		bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
			 sectors_not_moved, sectors_to_move,
			 buckets_not_moved, buckets_to_move);
	return;
out:
	bch2_btree_iter_unlock(&iter);
	bch2_move_ctxt_exit(&ctxt);
	trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
			    buckets_to_move);
}

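/* Do we have enough buckets in the movinggc reserve for one iteration? */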
static bool have_copygc_reserve(struct bch_dev *ca)
{
	bool ret;

	spin_lock(&ca->freelist_lock);
	ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
		COPYGC_BUCKETS_PER_ITER(ca);
	spin_unlock(&ca->freelist_lock);

	return ret;
}

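/* Heap comparison function: order copygc candidates by live sectors used */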
static inline int sectors_used_cmp(bucket_heap *heap,
				   struct bucket_heap_entry l,
				   struct bucket_heap_entry r)
{
	return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
}

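/*
 * A single copygc pass: wait for the movinggc reserve to be refilled, pick
 * the least-full data buckets whose live data totals at most
 * COPYGC_SECTORS_PER_ITER, sort them by bucket index for lookup, and move
 * their data with read_moving():
 */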
static void bch2_moving_gc(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bucket *g;
	u64 sectors_to_move = 0;
	size_t buckets_to_move, buckets_unused = 0;
	struct bucket_heap_entry e, *i;
	int reserve_sectors;

	if (!have_copygc_reserve(ca)) {
		struct closure cl;

		closure_init_stack(&cl);
		while (1) {
			closure_wait(&c->freelist_wait, &cl);
			if (have_copygc_reserve(ca))
				break;
			closure_sync(&cl);
		}
		closure_wake_up(&c->freelist_wait);
	}

	reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);

	trace_moving_gc_start(ca);

	/*
	 * Find buckets with lowest sector counts, skipping completely
	 * empty buckets, by building a maxheap sorted by sector count,
	 * and repeatedly replacing the maximum element until all
	 * buckets have been visited.
	 */

	/*
	 * We need bucket marks to be up to date - gc can't be recalculating
	 * them:
	 */
	down_read(&c->gc_lock);
	ca->copygc_heap.used = 0;
	for_each_bucket(g, ca) {
		struct bucket_mark m = READ_ONCE(g->mark);
		struct bucket_heap_entry e = { g - ca->buckets, m };

		if (bucket_unused(m)) {
			buckets_unused++;
			continue;
		}

		if (m.owned_by_allocator ||
		    m.data_type != BUCKET_DATA)
			continue;

		if (bucket_sectors_used(m) >= ca->mi.bucket_size)
			continue;

		heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp);
	}
	up_read(&c->gc_lock);

	for (i = ca->copygc_heap.data;
	     i < ca->copygc_heap.data + ca->copygc_heap.used;
	     i++)
		sectors_to_move += bucket_sectors_used(i->mark);

	while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
		BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp));
		sectors_to_move -= bucket_sectors_used(e.mark);
	}

	buckets_to_move = ca->copygc_heap.used;

	eytzinger0_sort(ca->copygc_heap.data,
			ca->copygc_heap.used,
			sizeof(ca->copygc_heap.data[0]),
			bucket_idx_cmp, NULL);

	read_moving(ca, buckets_to_move, sectors_to_move);
}

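/*
 * Background thread: sleeps on the write I/O clock until free space on the
 * device drops below half the GC reserve, then runs a copygc pass:
 */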
static int bch2_moving_gc_thread(void *arg)
{
	struct bch_dev *ca = arg;
	struct bch_fs *c = ca->fs;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last;
	u64 available, want, next;

	set_freezable();

	while (!kthread_should_stop()) {
		if (kthread_wait_freezable(c->copy_gc_enabled))
			break;

		last = atomic_long_read(&clock->now);
		/*
		 * don't start copygc until less than half the gc reserve is
		 * available:
		 */
		available = dev_buckets_available(ca);
		want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
				 c->opts.gc_reserve_percent, 200);
		if (available > want) {
			next = last + (available - want) *
				ca->mi.bucket_size;
			bch2_kthread_io_clock_wait(clock, next);
			continue;
		}

		bch2_moving_gc(ca);
	}

	return 0;
}

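/*
 * Stop the copygc thread; crank the rate limit up first so an in-flight pass
 * isn't throttled while we wait for the thread to exit:
 */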
void bch2_moving_gc_stop(struct bch_dev *ca)
{
	ca->moving_gc_pd.rate.rate = UINT_MAX;
	bch2_ratelimit_reset(&ca->moving_gc_pd.rate);

	if (ca->moving_gc_read)
		kthread_stop(ca->moving_gc_read);
	ca->moving_gc_read = NULL;
}

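/* Start the copygc thread (a no-op in nochanges mode): */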
int bch2_moving_gc_start(struct bch_dev *ca)
{
	struct task_struct *t;

	BUG_ON(ca->moving_gc_read);

	if (ca->fs->opts.nochanges)
		return 0;

	if (bch2_fs_init_fault("moving_gc_start"))
		return -ENOMEM;

	t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read");
	if (IS_ERR(t))
		return PTR_ERR(t);

	ca->moving_gc_read = t;
	wake_up_process(ca->moving_gc_read);

	return 0;
}

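/*
 * Per-device init: set up the PD controller that rate-limits copygc moves,
 * with the derivative term disabled:
 */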
void bch2_dev_moving_gc_init(struct bch_dev *ca)
{
	bch2_pd_controller_init(&ca->moving_gc_pd);
	ca->moving_gc_pd.d_term = 0;
}