/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
#include "btree_update.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <trace/events/bcache.h>
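
/*
 * Scale the cache-wide dirty data target by this backing device's share of
 * the cache, then feed target and actual dirty counts (in bytes) to the
 * writeback PD (proportional-differential) controller, which adjusts the
 * writeback rate to converge on the target.
 */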
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	u64 cache_dirty_target =
		div_u64(c->capacity * dc->writeback_percent, 100);
	s64 target = div64_u64(cache_dirty_target *
			       bdev_sectors(dc->disk_sb.bdev),
			       c->cached_dev_sectors);
	s64 dirty = bcache_dev_sectors_dirty(&dc->disk);

	/* sign argument of -1 is an assumption: throttle as dirty exceeds target */
	bch_pd_controller_update(&dc->writeback_pd, target << 9,
				 dirty << 9, -1);
}
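
/*
 * Periodic worker: recompute the writeback rate while the device has dirty
 * data, a writeback_percent target is set, and we aren't detaching;
 * otherwise run writeback flat out (UINT_MAX) so the cache drains.
 */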
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_pd_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent &&
	    !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		__update_writeback_rate(dc);
	else
		dc->writeback_pd.rate.rate = UINT_MAX;

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_pd_update,
			      dc->writeback_pd_update_seconds * HZ);
}
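
/*
 * State for one in-flight writeback IO: the extent being written back (as a
 * cmpxchg replace key), the cache device it was read from, and an inline
 * bio. Normally allocated with kzalloc(); under memory pressure we fall
 * back to a small fixed-size mempool (from_mempool).
 */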
struct dirty_io {
	struct closure		cl;
	struct bch_replace_info	replace;
	struct cached_dev	*dc;
	struct cache		*ca;
	struct keybuf_key	*w;
	struct bch_extent_ptr	ptr;
	int			error;
	bool			from_mempool;
	struct bio		bio;
};
#define DIRTY_IO_MEMPOOL_BVECS		64
#define DIRTY_IO_MEMPOOL_SECTORS	(DIRTY_IO_MEMPOOL_BVECS * PAGE_SECTORS)
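
/*
 * Initialize the inline bio to cover the extent in io->replace.key. When no
 * writeback_percent target is configured, background writeback IO runs at
 * idle priority so it doesn't compete with foreground traffic.
 */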
static void dirty_init(struct dirty_io *io)
{
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= io->replace.key.k.size << 9;
	bio->bi_max_vecs	=
		DIV_ROUND_UP(io->replace.key.k.size, PAGE_SECTORS);
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}
static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	if (io->from_mempool)
		mempool_free(io, &io->dc->writeback_io_pool);
	else
		kfree(io);
}
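
/*
 * Final stage of a writeback IO: free the bounce pages, then use a cmpxchg
 * btree insert to mark the extent cached (clean). The insert only succeeds
 * if the key wasn't modified while the writeback IO was in flight; losing
 * that race is counted as a writeback collision.
 */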
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		mempool_free(bv->bv_page, &dc->writeback_page_pool);

	/* Don't mark the extent clean if the IO errored: */
	if (!io->error) {
		BKEY_PADDED(k) tmp;
		int ret;

		bkey_copy(&tmp.k, &io->replace.key);
		io->replace.hook.fn = bch_extent_cmpxchg;
		bkey_extent_set_cached(&tmp.k.k, true);

		ret = bch_btree_insert(dc->disk.c, BTREE_ID_EXTENTS, &tmp.k,
				       NULL, &io->replace.hook, NULL, 0);
		if (io->replace.successes == 0)
			trace_bcache_writeback_collision(&io->replace.key.k);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_put(&dc->writeback_keys, io->w);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
static void dirty_endio(struct bio *bio)
{
	struct dirty_io *io = container_of(bio, struct dirty_io, bio);

	if (bio->bi_error) {
		trace_bcache_writeback_error(&io->replace.key.k,
					     op_is_write(bio_op(&io->bio)),
					     bio->bi_error);
		io->error = bio->bi_error;
	}

	closure_put(&io->cl);
}
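
/*
 * Second stage: reinitialize the bio via dirty_init() and write the data we
 * just read from the cache out to the backing device, then continue to
 * write_dirty_finish() to update the btree.
 */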
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	dirty_init(io);
	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
	io->bio.bi_iter.bi_sector =
		bkey_start_offset(&io->replace.key.k);
	io->bio.bi_bdev		= io->dc->disk_sb.bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty_finish, io->dc->disk.c->wq);
}
static void read_dirty_endio(struct bio *bio)
{
	struct dirty_io *io = container_of(bio, struct dirty_io, bio);

	bch_dev_nonfatal_io_err_on(bio->bi_error, io->ca, "writeback read");

	bch_account_io_completion(io->ca);

	if (ptr_stale(io->ca, &io->ptr))
		bio->bi_error = -EINTR;

	dirty_endio(bio);
}
static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, system_freezable_wq);
}
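
/*
 * Main writeback loop: pull dirty keys off the keybuf, paced by the PD
 * controller's ratelimit, read each extent from the cache into bounce pages
 * and kick off the read -> write -> btree update pipeline. An extent may be
 * written back in pieces if we fall back to the fixed-size mempool or run
 * short of bounce pages; bch_cut_front() advances past what each IO covers.
 */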
static u64 read_dirty(struct cached_dev *dc)
{
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;
	u64 sectors_written = 0;
	BKEY_PADDED(k) tmp;

	closure_init_stack(&cl);

	while (!bch_ratelimit_wait_freezable_stoppable(&dc->writeback_pd.rate)) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;
		sectors_written += w->key.k.size;
		bkey_copy(&tmp.k, &w->key);

		while (tmp.k.k.size) {
			struct extent_pick_ptr pick;
			struct bio_vec *bv;
			int i;

			bch_extent_pick_ptr(dc->disk.c,
					    bkey_i_to_s_c(&tmp.k),
					    &pick);
			if (IS_ERR_OR_NULL(pick.ca))
				break;
			io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) *
				     DIV_ROUND_UP(tmp.k.k.size,
						  PAGE_SECTORS),
				     GFP_KERNEL);
			if (!io) {
				trace_bcache_writeback_alloc_fail(pick.ca->set,
								  tmp.k.k.size);
				io = mempool_alloc(&dc->writeback_io_pool,
						   GFP_KERNEL);
				memset(io, 0, sizeof(*io) +
				       sizeof(struct bio_vec) *
				       DIRTY_IO_MEMPOOL_BVECS);
				io->from_mempool = true;
				bkey_copy(&io->replace.key, &tmp.k);
				if (DIRTY_IO_MEMPOOL_SECTORS <
				    io->replace.key.k.size)
					bch_key_resize(&io->replace.key.k,
						       DIRTY_IO_MEMPOOL_SECTORS);
			} else {
				bkey_copy(&io->replace.key, &tmp.k);
			}

			io->dc	= dc;
			io->ca	= pick.ca;
			io->w	= w;
			io->ptr	= pick.ptr;

			dirty_init(io);
			bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
			io->bio.bi_iter.bi_sector = pick.ptr.offset;
			io->bio.bi_bdev		= pick.ca->disk_sb.bdev;
			io->bio.bi_end_io	= read_dirty_endio;

			/* first page can block; later failures just truncate the IO: */
			bio_for_each_segment_all(bv, &io->bio, i) {
				bv->bv_page =
					mempool_alloc(&dc->writeback_page_pool,
						      i ? GFP_NOWAIT : GFP_KERNEL);
				if (!bv->bv_page) {
					BUG_ON(!i);
					io->bio.bi_vcnt = i;
					io->bio.bi_iter.bi_size =
						io->bio.bi_vcnt * PAGE_SIZE;

					bch_key_resize(&io->replace.key.k,
						       bio_sectors(&io->bio));
					break;
				}
			}

			bch_cut_front(io->replace.key.k.p, &tmp.k);
			trace_bcache_writeback(&io->replace.key.k);

			bch_ratelimit_increment(&dc->writeback_pd.rate,
						io->replace.key.k.size << 9);

			closure_call(&io->cl, read_dirty_submit, NULL, &cl);
		}

		bch_keybuf_put(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);

	return sectors_written;
}
/* Scan for dirty data */
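
/*
 * Update per-stripe dirty sector counts for [offset, offset + nr_sectors)
 * (nr_sectors may be negative when sectors go clean), and maintain the
 * full_dirty_stripes bitmap that refill_full_stripes() scans.
 */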
static void __bcache_dev_sectors_dirty_add(struct bcache_device *d,
					   u64 offset, int nr_sectors)
{
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d->stripe_sectors_dirty)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  u64 offset, int nr_sectors)
{
	struct bcache_device *d;

	d = bch_dev_find(c, inode);
	if (d)
		__bcache_dev_sectors_dirty_add(d, offset, nr_sectors);
}
static bool dirty_pred(struct keybuf *buf, struct bkey_s_c k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev,
					     writeback_keys);

	BUG_ON(k.k->p.inode != bcache_dev_inum(&dc->disk));

	return bkey_extent_is_data(k.k) &&
		!bkey_extent_is_cached(k.k);
}
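
/*
 * On devices where partial stripe writes are expensive (e.g. backing RAID),
 * preferentially write back stripes that are completely dirty: walk the
 * full_dirty_stripes bitmap from last_scanned, wrapping around at most
 * once, and refill the keybuf from each run of full stripes.
 */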
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned inode = bcache_dev_inum(&dc->disk);
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, buf->last_scanned.offset);
	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);
		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = POS(inode,
					stripe * dc->disk.stripe_size);

		bch_refill_keybuf(dc->disk.c, buf,
				  POS(inode,
				      next_stripe * dc->disk.stripe_size),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}
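
/*
 * One full writeback pass: repeatedly refill the keybuf with this device's
 * dirty extents (full stripes first where partial stripe writes are
 * expensive) and drain it via read_dirty(). Once no dirty keys remain, the
 * backing device superblock is marked clean. Returns sectors written, which
 * the writeback thread uses for IO clock throttling.
 */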
static u64 bch_writeback(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned inode = bcache_dev_inum(&dc->disk);
	struct bpos start = POS(inode, 0);
	struct bpos end = POS(inode, KEY_OFFSET_MAX);
	struct bpos start_pos;
	u64 sectors_written = 0;

	buf->last_scanned = POS(inode, 0);

	while (bkey_cmp(buf->last_scanned, end) < 0 &&
	       !kthread_should_stop()) {
		down_write(&dc->writeback_lock);

		if (!atomic_read(&dc->has_dirty)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return sectors_written;

			schedule();
			return sectors_written;
		}

		if (bkey_cmp(buf->last_scanned, end) >= 0)
			buf->last_scanned = POS(inode, 0);

		if (dc->partial_stripes_expensive) {
			refill_full_stripes(dc);
			if (array_freelist_empty(&buf->freelist))
				goto refill_done;
		}

		start_pos = buf->last_scanned;
		bch_refill_keybuf(dc->disk.c, buf, end, dirty_pred);

		if (bkey_cmp(buf->last_scanned, end) >= 0) {
			/*
			 * If we get to the end start scanning again from the
			 * beginning, and only scan up to where we initially
			 * started scanning from:
			 */
			buf->last_scanned = start;
			bch_refill_keybuf(dc->disk.c, buf, start_pos,
					  dirty_pred);
		}

		if (RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(dc->disk_sb.sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}
refill_done:
		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_pd.rate);
		sectors_written += read_dirty(dc);
	}

	return sectors_written;
}
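
/*
 * Writeback thread: run writeback passes for as long as the device has
 * work, throttling against the cache set's write IO clock so that small
 * passes (under 1/16th of the cache) don't rescan the btree too often.
 */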
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	struct cache_set *c = dc->disk.c;
	struct io_clock *clock = &c->io_clock[WRITE];
	unsigned long last;
	u64 sectors_written;

	set_freezable();

	while (!kthread_should_stop()) {
		if (kthread_wait_freezable(dc->writeback_running ||
					   test_bit(BCACHE_DEV_DETACHING,
						    &dc->disk.flags)))
			break;

		last = atomic_long_read(&clock->now);

		sectors_written = bch_writeback(dc);

		if (sectors_written < c->capacity >> 4)
			bch_kthread_io_clock_wait(clock,
						  last + (c->capacity >> 5));
	}

	return 0;
}
/**
 * bch_writeback_recalc_oldest_gens - update oldest_gen pointers from writeback keys
 *
 * This prevents us from wrapping around gens for a bucket only referenced from
 * writeback keybufs. We don't actually care that the data in those buckets is
 * marked live, only that we don't wrap the gens.
 */
void bch_writeback_recalc_oldest_gens(struct cache_set *c)
{
	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();

	radix_tree_for_each_slot(slot, &c->devices, &iter, 0) {
		struct bcache_device *d;
		struct cached_dev *dc;

		d = radix_tree_deref_slot(slot);

		if (!CACHED_DEV(&d->inode.v))
			continue;
		dc = container_of(d, struct cached_dev, disk);

		bch_keybuf_recalc_oldest_gens(c, &dc->writeback_keys);
	}

	rcu_read_unlock();
}
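
/*
 * Walk this device's extents at startup and rebuild the in-memory dirty
 * sector counts, then seed the PD controller with the actual dirty total.
 */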
void bch_sectors_dirty_init(struct cached_dev *dc, struct cache_set *c)
{
	struct bcache_device *d = &dc->disk;
	struct btree_iter iter;
	struct bkey_s_c k;

	/*
	 * We have to do this before the disk is added to the radix tree or we
	 * race with moving GC
	 */
	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
			   POS(bcache_dev_inum(d), 0), k) {
		if (k.k->p.inode > bcache_dev_inum(d))
			break;

		if (bkey_extent_is_data(k.k) &&
		    !bkey_extent_is_cached(k.k))
			__bcache_dev_sectors_dirty_add(d,
						       bkey_start_offset(k.k),
						       k.k->size);

		bch_btree_iter_cond_resched(&iter);
	}
	bch_btree_iter_unlock(&iter);

	dc->writeback_pd.last_actual = bcache_dev_sectors_dirty(d);
}
void bch_cached_dev_writeback_stop(struct cached_dev *dc)
{
	cancel_delayed_work_sync(&dc->writeback_pd_update);

	if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
		kthread_stop(dc->writeback_thread);
		dc->writeback_thread = NULL;
	}
}
void bch_cached_dev_writeback_free(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;

	mempool_exit(&dc->writeback_page_pool);
	mempool_exit(&dc->writeback_io_pool);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);
}
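
/*
 * Allocate the per-stripe dirty accounting (kzalloc for arrays under 64
 * pages, vzalloc beyond that), the emergency dirty_io mempool, and a 64k
 * pool of bounce pages, then set writeback defaults and hook up the PD
 * controller and its periodic rate update.
 */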
int bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	sector_t sectors;
	size_t n;

	sectors = get_capacity(dc->disk.disk);

	if (!d->stripe_size) {
#ifdef CONFIG_BCACHE_DEBUG
		d->stripe_size = 1 << 0;
#else
		d->stripe_size = 1 << 31;
#endif
	}

	pr_debug("stripe size: %u sectors", d->stripe_size);
	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
		       (unsigned)d->nr_stripes);
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty) {
		pr_err("cannot allocate stripe_sectors_dirty");
		return -ENOMEM;
	}

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->full_dirty_stripes) {
		pr_err("cannot allocate full_dirty_stripes");
		return -ENOMEM;
	}

	if (mempool_init_kmalloc_pool(&dc->writeback_io_pool, 4,
				      sizeof(struct dirty_io) +
				      sizeof(struct bio_vec) *
				      DIRTY_IO_MEMPOOL_BVECS) ||
	    mempool_init_page_pool(&dc->writeback_page_pool,
				   (64 << 10) / PAGE_SIZE, 0))
		return -ENOMEM;

	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_pd_update_seconds	= 5;

	bch_pd_controller_init(&dc->writeback_pd);
	INIT_DELAYED_WORK(&dc->writeback_pd_update, update_writeback_rate);

	return 0;
}
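
/*
 * Create (but don't wake) the writeback thread, start the periodic rate
 * update, and kick an initial writeback pass via bch_writeback_queue().
 */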
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_pd_update,
			      dc->writeback_pd_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}