/*
 * Moving/copying garbage collector
 *
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "extents.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"

#include <trace/events/bcache.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/wait.h>

/* Moving GC - IO loop */

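/*
 * Decide whether an extent should be moved: returns the pointer in @k that
 * lives on @ca in a bucket marked for copygc, or NULL if there is none.
 */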
static const struct bch_extent_ptr *moving_pred(struct cache *ca,
                                                struct bkey_s_c k)
{
        const struct bch_extent_ptr *ptr;

        if (bkey_extent_is_data(k.k)) {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);

                extent_for_each_ptr(e, ptr)
                        if ((ca->sb.nr_this_dev == ptr->dev) &&
                            PTR_BUCKET(ca, ptr)->mark.copygc)
                                return ptr;
        }

        return NULL;
}

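/*
 * Queue a single extent for rewriting via this device's copygc write point.
 * Returns nonzero if the move could not be started.
 */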
static int issue_moving_gc_move(struct cache *ca,
                                struct moving_context *ctxt,
                                struct bkey_s_c k)
{
        struct cache_set *c = ca->set;
        const struct bch_extent_ptr *ptr;
        int ret;

        ptr = moving_pred(ca, k);
        if (!ptr) /* We raced - bucket's been reused */
                return 0;

        ret = bch_data_move(c, ctxt, &ca->copygc_write_point, k, ptr);
        if (!ret)
                trace_bcache_gc_copy(k.k);
        else
                trace_bcache_moving_gc_alloc_fail(c, k.k->size);
        return ret;
}

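/*
 * Walk the extents btree and issue a move for every extent that still has a
 * pointer into a bucket marked for copygc, then report how much data, if any,
 * could not be evacuated.
 */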
static void read_moving(struct cache *ca, size_t buckets_to_move,
                        u64 sectors_to_move)
{
        struct cache_set *c = ca->set;
        struct bucket *g;
        struct moving_context ctxt;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 sectors_not_moved = 0;
        size_t buckets_not_moved = 0;

        bch_ratelimit_reset(&ca->moving_gc_pd.rate);
        bch_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate,
                                SECTORS_IN_FLIGHT_PER_DEVICE);
        bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);

        while (1) {
                if (kthread_should_stop())
                        goto out;
                if (bch_move_ctxt_wait(&ctxt))
                        goto out;
                k = bch_btree_iter_peek(&iter);
                if (!k.k)
                        break;
                if (btree_iter_err(k))
                        goto out;

                if (!moving_pred(ca, k))
                        goto next;

                if (issue_moving_gc_move(ca, &ctxt, k)) {
                        bch_btree_iter_unlock(&iter);

                        /*
                         * Memory allocation failure: wait for some in-flight
                         * IO to finish, then retry this key:
                         */
                        bch_move_ctxt_wait_for_io(&ctxt);
                        continue;
                }
next:
                bch_btree_iter_advance_pos(&iter);
                //bch_btree_iter_cond_resched(&iter);

                /* unlock before calling bch_move_ctxt_wait() next iteration */
                bch_btree_iter_unlock(&iter);
                cond_resched();
        }

        bch_btree_iter_unlock(&iter);
        bch_move_ctxt_exit(&ctxt);
        trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
                                   buckets_to_move);

        /* don't check this if we bailed out early: */
        for_each_bucket(g, ca)
                if (g->mark.copygc && bucket_sectors_used(g)) {
                        sectors_not_moved += bucket_sectors_used(g);
                        buckets_not_moved++;
                }

        if (sectors_not_moved)
                bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved",
                         sectors_not_moved, sectors_to_move,
                         buckets_not_moved, buckets_to_move);
        return;
out:
        bch_btree_iter_unlock(&iter);
        bch_move_ctxt_exit(&ctxt);
        trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved,
                                   buckets_to_move);
}

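/*
 * Check whether the allocator has set aside enough free buckets for copygc
 * to make progress this iteration:
 */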
static bool have_copygc_reserve(struct cache *ca)
{
        bool ret;

        spin_lock(&ca->freelist_lock);
        ret = fifo_used(&ca->free[RESERVE_MOVINGGC]) >=
                COPYGC_BUCKETS_PER_ITER(ca);
        spin_unlock(&ca->freelist_lock);

        return ret;
}

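/*
 * Run one copygc pass: pick the buckets on this device that are cheapest to
 * evacuate, mark them for copygc, and move their live data out so the buckets
 * can be reused.
 */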
static void bch_moving_gc(struct cache *ca)
{
        struct cache_set *c = ca->set;
        struct bucket *g;
        struct bucket_mark new;
        u64 sectors_to_move;
        size_t buckets_to_move, buckets_unused = 0;
        struct bucket_heap_entry e;
        unsigned sectors_used, i;
        int reserve_sectors;

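        /* Wait until the allocator has filled the copygc reserve: */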
        if (!have_copygc_reserve(ca)) {
                struct closure cl;

                closure_init_stack(&cl);
                while (1) {
                        closure_wait(&c->freelist_wait, &cl);
                        if (have_copygc_reserve(ca))
                                break;
                        closure_sync(&cl);
                }
                closure_wake_up(&c->freelist_wait);
        }

        reserve_sectors = COPYGC_SECTORS_PER_ITER(ca);

        trace_bcache_moving_gc_start(ca);

        /*
         * Find buckets with lowest sector counts, skipping completely
         * empty buckets, by building a maxheap sorted by sector count,
         * and repeatedly replacing the maximum element until all
         * buckets have been visited.
         */

        /*
         * We need bucket marks to be up to date, so gc can't be recalculating
         * them, and we don't want the allocator invalidating a bucket after
         * we've decided to evacuate it but before we set copygc:
         */
        down_read(&c->gc_lock);
        mutex_lock(&ca->heap_lock);
        mutex_lock(&ca->set->bucket_lock);

        ca->heap.used = 0;
        for_each_bucket(g, ca) {
                bucket_cmpxchg(g, new, new.copygc = 0);

                if (bucket_unused(g)) {
                        buckets_unused++;
                        continue;
                }

                if (g->mark.owned_by_allocator ||
                    g->mark.is_metadata)
                        continue;

                sectors_used = bucket_sectors_used(g);

                if (sectors_used >= ca->mi.bucket_size)
                        continue;

                bucket_heap_push(ca, g, sectors_used);
        }

        sectors_to_move = 0;
        for (i = 0; i < ca->heap.used; i++)
                sectors_to_move += ca->heap.data[i].val;

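        /*
         * Trim the candidate set until the remaining work fits in one copygc
         * pass; the buckets still in the heap are the ones marked and
         * evacuated below:
         */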
        while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
                BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp));
                sectors_to_move -= e.val;
        }

        for (i = 0; i < ca->heap.used; i++)
                bucket_cmpxchg(ca->heap.data[i].g, new, new.copygc = 1);

        buckets_to_move = ca->heap.used;

        mutex_unlock(&ca->set->bucket_lock);
        mutex_unlock(&ca->heap_lock);
        up_read(&c->gc_lock);

        read_moving(ca, buckets_to_move, sectors_to_move);
}

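/*
 * Background thread: sleep on the write I/O clock until free space on this
 * device drops below the copygc threshold, then run a copygc pass.
 */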
static int bch_moving_gc_thread(void *arg)
{
        struct cache *ca = arg;
        struct cache_set *c = ca->set;
        struct io_clock *clock = &c->io_clock[WRITE];
        unsigned long last;
        u64 available, want, next;

        set_freezable();

        while (!kthread_should_stop()) {
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;

                last = atomic_long_read(&clock->now);
                /*
                 * don't start copygc until less than half the gc reserve is
                 * available:
                 */
                available = buckets_available_cache(ca);
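                /*
                 * gc_reserve_percent is a percentage of the device's usable
                 * buckets, so dividing by 200 gives half the reserve:
                 */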
                want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
                                 c->opts.gc_reserve_percent, 200);
                if (available > want) {
                        next = last + (available - want) *
                                ca->mi.bucket_size;
                        bch_kthread_io_clock_wait(clock, next);
                        continue;
                }

                bch_moving_gc(ca);
        }

        return 0;
}

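/* Set up the pd controller that rate limits copygc's moves: */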
void bch_moving_init_cache(struct cache *ca)
{
        bch_pd_controller_init(&ca->moving_gc_pd);
        ca->moving_gc_pd.d_term = 0;
}

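/* Start the per-device copygc thread; returns 0 on success or a -errno: */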
int bch_moving_gc_thread_start(struct cache *ca)
{
        struct task_struct *t;

        /* The moving gc read thread must be stopped */
        BUG_ON(ca->moving_gc_read != NULL);

        if (cache_set_init_fault("moving_gc_start"))
                return -ENOMEM;

        t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read");
        if (IS_ERR(t))
                return PTR_ERR(t);

        ca->moving_gc_read = t;
        wake_up_process(ca->moving_gc_read);

        return 0;
}

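/*
 * Stop the copygc thread: open up the rate limit so any pending moves drain
 * quickly, then stop the kthread.
 */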
void bch_moving_gc_stop(struct cache *ca)
{
        ca->moving_gc_pd.rate.rate = UINT_MAX;
        bch_ratelimit_reset(&ca->moving_gc_pd.rate);

        if (ca->moving_gc_read)
                kthread_stop(ca->moving_gc_read);
        ca->moving_gc_read = NULL;
}