// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "errcode.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "rebalance.h"
#include "super-io.h"
#include "trace.h"

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>

/*
 * Check if an extent should be moved:
 * returns true and fills in data_opts->rewrite_ptrs if any of the extent's
 * pointers should be rewritten, false otherwise.
 */
static bool rebalance_pred(struct bch_fs *c, void *arg,
                           struct bkey_s_c k,
                           struct bch_io_opts *io_opts,
                           struct data_update_opts *data_opts)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        unsigned i;

        data_opts->rewrite_ptrs         = 0;
        data_opts->target               = io_opts->background_target;
        data_opts->extra_replicas       = 0;
        data_opts->btree_insert_flags   = 0;

        if (io_opts->background_compression &&
            !bch2_bkey_is_incompressible(k)) {
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;

                i = 0;
                bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                        if (!p.ptr.cached &&
                            p.crc.compression_type !=
                            bch2_compression_opt_to_type[io_opts->background_compression])
                                data_opts->rewrite_ptrs |= 1U << i;
                        i++;
                }
        }

        if (io_opts->background_target) {
                const struct bch_extent_ptr *ptr;

                i = 0;
                bkey_for_each_ptr(ptrs, ptr) {
                        if (!ptr->cached &&
                            !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
                            bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
                                data_opts->rewrite_ptrs |= 1U << i;
                        i++;
                }
        }

        return data_opts->rewrite_ptrs != 0;
}

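/*
 * Account a key's sectors as pending rebalance work: for each pointer that
 * rebalance_pred() flags for rewriting, add the extent's size to that device's
 * rebalance_work counter, waking the rebalance thread when a counter goes from
 * zero to nonzero.
 */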
void bch2_rebalance_add_key(struct bch_fs *c,
                            struct bkey_s_c k,
                            struct bch_io_opts *io_opts)
{
        struct data_update_opts update_opts = { 0 };
        struct bkey_ptrs_c ptrs;
        const struct bch_extent_ptr *ptr;
        unsigned i;

        if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
                return;

        i = 0;
        ptrs = bch2_bkey_ptrs_c(k);
        bkey_for_each_ptr(ptrs, ptr) {
                if ((1U << i) & update_opts.rewrite_ptrs)
                        if (atomic64_add_return(k.k->size,
                                        &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
                            k.k->size)
                                rebalance_wakeup(c);
                i++;
        }
}

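/*
 * Account rebalance work that isn't yet attributed to a particular device;
 * wakes the rebalance thread when the counter transitions from zero.
 */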
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
        if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
            sectors)
                rebalance_wakeup(c);
}

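/*
 * Summary of pending rebalance work: the device that is proportionally
 * fullest (pending work as a percentage of its capacity) and the total work
 * across the filesystem, in sectors.
 */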
struct rebalance_work {
        int             dev_most_full_idx;
        unsigned        dev_most_full_percent;
        u64             dev_most_full_work;
        u64             dev_most_full_capacity;
        u64             total_work;
};

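/*
 * Fold one device's pending work (plus work not attributed to any device)
 * into the summary: saturate on overflow, clamp to the device's capacity,
 * and remember whichever device is proportionally fullest.
 */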
static void rebalance_work_accumulate(struct rebalance_work *w,
                u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
        unsigned percent_full;
        u64 work = dev_work + unknown_dev;

        if (work < dev_work || work < unknown_dev)
                work = U64_MAX;
        work = min(work, capacity);

        percent_full = div64_u64(work * 100, capacity);

        if (percent_full >= w->dev_most_full_percent) {
                w->dev_most_full_idx            = idx;
                w->dev_most_full_percent        = percent_full;
                w->dev_most_full_work           = work;
                w->dev_most_full_capacity       = capacity;
        }

        if (w->total_work + dev_work >= w->total_work &&
            w->total_work + dev_work >= dev_work)
                w->total_work += dev_work;
}

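/*
 * Build the current work summary: accumulate each online member device,
 * counting the unattributed work against every device, then once more
 * against the filesystem's total capacity.
 */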
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
        struct bch_dev *ca;
        struct rebalance_work ret = { .dev_most_full_idx = -1 };
        u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
        unsigned i;

        for_each_online_member(ca, c, i)
                rebalance_work_accumulate(&ret,
                        atomic64_read(&ca->rebalance_work),
                        unknown_dev,
                        bucket_to_sector(ca, ca->mi.nbuckets -
                                         ca->mi.first_bucket),
                        i);

        rebalance_work_accumulate(&ret,
                unknown_dev, 0, c->capacity, -1);

        return ret;
}

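/* Zero the per-device and unknown-device rebalance work counters. */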
static void rebalance_work_reset(struct bch_fs *c)
{
        struct bch_dev *ca;
        unsigned i;

        for_each_online_member(ca, c, i)
                atomic64_set(&ca->rebalance_work, 0);

        atomic64_set(&c->rebalance.work_unknown_dev, 0);
}

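/* This thread's user + system CPU time so far, in jiffies. */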
static unsigned long curr_cputime(void)
{
        u64 utime, stime;

        task_cputime_adjusted(current, &utime, &stime);
        return nsecs_to_jiffies(utime + stime);
}

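/*
 * Main rebalance loop: sleep while there is no pending work, throttle on the
 * IO clock when the fullest device is below 20% pending work, otherwise
 * adjust the rate controller and rewrite extents across the whole keyspace
 * with bch2_move_data(), using rebalance_pred() to pick what to move.
 */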
static int bch2_rebalance_thread(void *arg)
{
        struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
        struct io_clock *clock = &c->io_clock[WRITE];
        struct rebalance_work w, p;
        struct bch_move_stats move_stats;
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
        u64 io_start;
        long throttle;

        set_freezable();

        io_start        = atomic64_read(&clock->now);
        p               = rebalance_work(c);
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();

        bch2_move_stats_init(&move_stats, "rebalance");
        while (!kthread_wait_freezable(r->enabled)) {
                cond_resched();

                start                   = jiffies;
                cputime                 = curr_cputime();

                prev_run_time           = start - prev_start;
                prev_run_cputime        = cputime - prev_cputime;

                w                       = rebalance_work(c);
                BUG_ON(!w.dev_most_full_capacity);

                if (!w.total_work) {
                        r->state = REBALANCE_WAITING;
                        kthread_wait_freezable(rebalance_work(c).total_work);
                        continue;
                }

                /*
                 * If there isn't much work to do, throttle cpu usage:
                 */
                throttle = prev_run_cputime * 100 /
                        max(1U, w.dev_most_full_percent) -
                        prev_run_time;

                if (w.dev_most_full_percent < 20 && throttle > 0) {
                        r->throttled_until_iotime = io_start +
                                div_u64(w.dev_most_full_capacity *
                                        (20 - w.dev_most_full_percent),
                                        50);

                        if (atomic64_read(&clock->now) + clock->max_slop <
                            r->throttled_until_iotime) {
                                r->throttled_until_cputime = start + throttle;
                                r->state = REBALANCE_THROTTLED;

                                bch2_kthread_io_clock_wait(clock,
                                        r->throttled_until_iotime,
                                        throttle);
                                continue;
                        }
                }

                /* minimum 1 MB/sec: */
                r->pd.rate.rate =
                        max_t(u64, 1 << 11,
                              r->pd.rate.rate *
                              max(p.dev_most_full_percent, 1U) /
                              max(w.dev_most_full_percent, 1U));

                io_start        = atomic64_read(&clock->now);
                p               = w;
                prev_start      = start;
                prev_cputime    = cputime;

                r->state = REBALANCE_RUNNING;
                memset(&move_stats, 0, sizeof(move_stats));
                rebalance_work_reset(c);

                bch2_move_data(c,
                               0,               POS_MIN,
                               BTREE_ID_NR,     POS_MAX,
                               /* ratelimiting disabled for now */
                               NULL, /*  &r->pd.rate, */
                               &move_stats,
                               writepoint_ptr(&c->rebalance_write_point),
                               true,
                               rebalance_pred, NULL);
        }

        return 0;
}

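/* Print the current work summary, rate and thread state to a printbuf. */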
void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
{
        struct bch_fs_rebalance *r = &c->rebalance;
        struct rebalance_work w = rebalance_work(c);

        if (!out->nr_tabstops)
                printbuf_tabstop_push(out, 20);

        prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
        prt_tab(out);

        prt_human_readable_u64(out, w.dev_most_full_work << 9);
        prt_printf(out, "/");
        prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
        prt_newline(out);

        prt_printf(out, "total work:");
        prt_tab(out);

        prt_human_readable_u64(out, w.total_work << 9);
        prt_printf(out, "/");
        prt_human_readable_u64(out, c->capacity << 9);
        prt_newline(out);

        prt_printf(out, "rate:");
        prt_tab(out);
        prt_printf(out, "%u", r->pd.rate.rate);
        prt_newline(out);

        switch (r->state) {
        case REBALANCE_WAITING:
                prt_printf(out, "waiting");
                break;
        case REBALANCE_THROTTLED:
                prt_printf(out, "throttled for %lu sec or ",
                       (r->throttled_until_cputime - jiffies) / HZ);
                prt_human_readable_u64(out,
                            (r->throttled_until_iotime -
                             atomic64_read(&c->io_clock[WRITE].now)) << 9);
                prt_printf(out, " io");
                break;
        case REBALANCE_RUNNING:
                prt_printf(out, "running");
                break;
        }
        prt_newline(out);
}

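/*
 * Stop the rebalance thread: unthrottle the rate controller, clear the thread
 * pointer, and wait for an RCU grace period (so rebalance_wakeup() can't race
 * with teardown) before stopping the task and dropping our reference.
 */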
void bch2_rebalance_stop(struct bch_fs *c)
{
        struct task_struct *p;

        c->rebalance.pd.rate.rate = UINT_MAX;
        bch2_ratelimit_reset(&c->rebalance.pd.rate);

        p = rcu_dereference_protected(c->rebalance.thread, 1);
        c->rebalance.thread = NULL;

        if (p) {
                /* for synchronizing with rebalance_wakeup() */
                synchronize_rcu();

                kthread_stop(p);
                put_task_struct(p);
        }
}

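/*
 * Start the rebalance thread, unless it is already running or the filesystem
 * was mounted with nochanges; the task_struct is published with
 * rcu_assign_pointer() so rebalance_wakeup() can dereference it locklessly.
 */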
int bch2_rebalance_start(struct bch_fs *c)
{
        struct task_struct *p;
        int ret;

        if (c->rebalance.thread)
                return 0;

        if (c->opts.nochanges)
                return 0;

        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        if (ret) {
                bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
                return ret;
        }

        get_task_struct(p);
        rcu_assign_pointer(c->rebalance.thread, p);
        wake_up_process(p);
        return 0;
}

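/*
 * Filesystem-init hook: set up the rate controller and start the
 * unknown-device work counter at S64_MAX, so the first rebalance pass treats
 * the whole filesystem as potentially having work to do.
 */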
void bch2_fs_rebalance_init(struct bch_fs *c)
{
        bch2_pd_controller_init(&c->rebalance.pd);

        atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}