1 // SPDX-License-Identifier: GPL-2.0
5 #include "journal_io.h"
6 #include "journal_reclaim.h"
10 /* Free space calculations: */
12 static unsigned journal_space_from(struct journal_device *ja,
13 enum journal_space_from from)
16 case journal_space_discarded:
17 return ja->discard_idx;
18 case journal_space_clean_ondisk:
19 return ja->dirty_idx_ondisk;
20 case journal_space_clean:
27 unsigned bch2_journal_dev_buckets_available(struct journal *j,
28 struct journal_device *ja,
29 enum journal_space_from from)
31 unsigned available = (journal_space_from(ja, from) -
32 ja->cur_idx - 1 + ja->nr) % ja->nr;
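/*
 * For illustration (hypothetical numbers, not from the original source): with
 * ja->nr == 8 buckets, ja->cur_idx == 5 and journal_space_from() returning 2,
 * the ring distance above evaluates to (2 - 5 - 1 + 8) % 8 == 4 available
 * buckets.
 */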
35 * Don't use the last bucket unless writing the new last_seq
36 * will make another bucket available:
38 if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
44 static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
46 union journal_preres_state old, new;
47 u64 v = atomic64_read(&j->prereserved.counter);
51 new.remaining = u64s_remaining;
52 } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
53 old.v, new.v)) != old.v);
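/*
 * The loop above is a standard lockless read-modify-write: reread the packed
 * prereserved state and retry the cmpxchg until no other thread has modified
 * it in between.
 */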
56 static struct journal_space {
59 } __journal_space_available(struct journal *j, unsigned nr_devs_want,
60 enum journal_space_from from)
62 struct bch_fs *c = container_of(j, struct bch_fs, journal);
64 unsigned sectors_next_entry = UINT_MAX;
65 unsigned sectors_total = UINT_MAX;
66 unsigned i, nr_devs = 0;
67 unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
68 ? journal_prev_buf(j)->sectors
72 for_each_member_device_rcu(ca, c, i,
73 &c->rw_devs[BCH_DATA_journal]) {
74 struct journal_device *ja = &ca->journal;
75 unsigned buckets_this_device, sectors_this_device;
80 buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
81 sectors_this_device = ja->sectors_free;
84 * Note that we don't allocate the space for a journal entry
85 * until we write it out - thus, account for it here:
87 if (unwritten_sectors >= sectors_this_device) {
88 if (!buckets_this_device)
91 buckets_this_device--;
92 sectors_this_device = ca->mi.bucket_size;
95 sectors_this_device -= unwritten_sectors;
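/*
 * For illustration (hypothetical numbers): with ca->mi.bucket_size == 1024
 * sectors, ja->sectors_free == 100 and a 256-sector unwritten entry, the
 * branch above moves us to a fresh bucket (buckets_this_device--,
 * sectors_this_device = 1024), leaving 1024 - 256 == 768 sectors for the
 * next entry on this device.
 */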
97 if (sectors_this_device < ca->mi.bucket_size &&
98 buckets_this_device) {
99 buckets_this_device--;
100 sectors_this_device = ca->mi.bucket_size;
103 if (!sectors_this_device)
106 sectors_next_entry = min(sectors_next_entry,
107 sectors_this_device);
109 sectors_total = min(sectors_total,
110 buckets_this_device * ca->mi.bucket_size +
111 sectors_this_device);
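/*
 * sectors_next_entry ends up as the worst case (smallest) contiguous space
 * any device has for the very next journal write; sectors_total is the worst
 * case total space, counting whole remaining buckets.
 */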
117 if (nr_devs < nr_devs_want)
118 return (struct journal_space) { 0, 0 };
120 return (struct journal_space) {
121 .next_entry = sectors_next_entry,
122 .remaining = max_t(int, 0, sectors_total - sectors_next_entry),
126 void bch2_journal_space_available(struct journal *j)
128 struct bch_fs *c = container_of(j, struct bch_fs, journal);
130 struct journal_space discarded, clean_ondisk, clean;
131 unsigned overhead, u64s_remaining = 0;
132 unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
133 j->buf[1].buf_size >> 9);
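/* buf_size is in bytes; ">> 9" converts it to 512-byte sectors. */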
134 unsigned i, nr_online = 0, nr_devs_want;
135 bool can_discard = false;
138 lockdep_assert_held(&j->lock);
141 for_each_member_device_rcu(ca, c, i,
142 &c->rw_devs[BCH_DATA_journal]) {
143 struct journal_device *ja = &ca->journal;
148 while (ja->dirty_idx != ja->cur_idx &&
149 ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
150 ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
152 while (ja->dirty_idx_ondisk != ja->dirty_idx &&
153 ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
154 ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
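/*
 * The two loops above advance dirty_idx past buckets whose newest entry is
 * older than the current in-memory last_seq, and dirty_idx_ondisk past
 * buckets that are already clean according to the last_seq we've written out.
 */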
156 if (ja->discard_idx != ja->dirty_idx_ondisk)
159 max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
164 j->can_discard = can_discard;
166 if (nr_online < c->opts.metadata_replicas_required) {
171 if (!fifo_free(&j->pin)) {
176 nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
178 discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded);
179 clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
180 clean = __journal_space_available(j, nr_devs_want, journal_space_clean);
182 if (!discarded.next_entry)
185 overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
186 journal_entry_overhead(j);
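/*
 * Convert sectors to units of u64s: a 512-byte sector holds 64 u64s, hence
 * the shift by 6; the estimated entry-header overhead computed above is then
 * subtracted, clamped so the result never goes negative.
 */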
187 u64s_remaining = clean.remaining << 6;
188 u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
191 j->cur_entry_sectors = !ret ? discarded.next_entry : 0;
192 j->cur_entry_error = ret;
193 journal_set_remaining(j, u64s_remaining);
194 journal_check_may_get_unreserved(j);
200 /* Discards - last part of journal reclaim: */
202 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
207 ret = ja->discard_idx != ja->dirty_idx_ondisk;
208 spin_unlock(&j->lock);
214 * Advance ja->discard_idx as long as it points to buckets that are no longer
215 * dirty, issuing discards if necessary:
217 void bch2_journal_do_discards(struct journal *j)
219 struct bch_fs *c = container_of(j, struct bch_fs, journal);
223 mutex_lock(&j->discard_lock);
225 for_each_rw_member(ca, c, iter) {
226 struct journal_device *ja = &ca->journal;
228 while (should_discard_bucket(j, ja)) {
229 if (ca->mi.discard &&
230 blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
231 blkdev_issue_discard(ca->disk_sb.bdev,
233 ja->buckets[ja->discard_idx]),
234 ca->mi.bucket_size, GFP_NOIO, 0);
237 ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
239 bch2_journal_space_available(j);
240 spin_unlock(&j->lock);
244 mutex_unlock(&j->discard_lock);
248 * Journal entry pinning - machinery for holding a reference on a given journal
249 * entry, holding it open to ensure it gets replayed during recovery:
252 static void bch2_journal_reclaim_fast(struct journal *j)
254 struct journal_entry_pin_list temp;
257 lockdep_assert_held(&j->lock);
260 * Unpin journal entries whose reference counts have reached zero, meaning
261 * all of their btree nodes have been written out
263 while (!fifo_empty(&j->pin) &&
264 !atomic_read(&fifo_peek_front(&j->pin).count)) {
265 BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
266 BUG_ON(!fifo_pop(&j->pin, temp));
271 bch2_journal_space_available(j);
274 void bch2_journal_pin_put(struct journal *j, u64 seq)
276 struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
278 if (atomic_dec_and_test(&pin_list->count)) {
280 bch2_journal_reclaim_fast(j);
281 spin_unlock(&j->lock);
285 static inline void __journal_pin_drop(struct journal *j,
286 struct journal_entry_pin *pin)
288 struct journal_entry_pin_list *pin_list;
290 if (!journal_pin_active(pin))
293 pin_list = journal_seq_pin(j, pin->seq);
295 list_del_init(&pin->list);
298 * Unpinning a journal entry may make journal_next_bucket() succeed, if
299 * writing a new last_seq will now make another bucket available:
301 if (atomic_dec_and_test(&pin_list->count) &&
302 pin_list == &fifo_peek_front(&j->pin))
303 bch2_journal_reclaim_fast(j);
304 else if (fifo_used(&j->pin) == 1 &&
305 atomic_read(&pin_list->count) == 1)
309 void bch2_journal_pin_drop(struct journal *j,
310 struct journal_entry_pin *pin)
313 __journal_pin_drop(j, pin);
314 spin_unlock(&j->lock);
317 static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
318 struct journal_entry_pin *pin,
319 journal_pin_flush_fn flush_fn)
321 struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
323 __journal_pin_drop(j, pin);
325 BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));
327 atomic_inc(&pin_list->count);
329 pin->flush = flush_fn;
331 list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
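/*
 * A minimal usage sketch (hypothetical caller, not part of this file): code
 * that dirties something against journal sequence number seq keeps that entry
 * pinned until its flush callback has persisted the dirty state:
 *
 *	__bch2_journal_pin_add(j, seq, &my_pin, my_flush_fn);
 *	...
 *	bch2_journal_pin_drop(j, &my_pin);
 */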
334 void __bch2_journal_pin_add(struct journal *j, u64 seq,
335 struct journal_entry_pin *pin,
336 journal_pin_flush_fn flush_fn)
339 bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
340 spin_unlock(&j->lock);
343 * If the journal is currently full, we might want to call flush_fn immediately:
349 void bch2_journal_pin_update(struct journal *j, u64 seq,
350 struct journal_entry_pin *pin,
351 journal_pin_flush_fn flush_fn)
353 if (journal_pin_active(pin) && pin->seq < seq)
358 if (pin->seq != seq) {
359 bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
361 struct journal_entry_pin_list *pin_list =
362 journal_seq_pin(j, seq);
365 * If the pin is already pinning the right sequence number, it
366 * still might've already been flushed:
368 list_move(&pin->list, &pin_list->list);
371 spin_unlock(&j->lock);
374 * If the journal is currently full, we might want to call flush_fn immediately:
380 void bch2_journal_pin_copy(struct journal *j,
381 struct journal_entry_pin *dst,
382 struct journal_entry_pin *src,
383 journal_pin_flush_fn flush_fn)
387 if (journal_pin_active(src) &&
388 (!journal_pin_active(dst) || src->seq < dst->seq))
389 bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);
391 spin_unlock(&j->lock);
395 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
397 void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
399 BUG_ON(journal_pin_active(pin));
401 wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
405 * Journal reclaim: flush references to open journal entries to reclaim space in the journal.
408 * May be done by the journal code in the background as needed to free up space
409 * for more journal entries, or as part of doing a clean shutdown, or to migrate
410 * data off of a specific device:
413 static struct journal_entry_pin *
414 journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
416 struct journal_entry_pin_list *pin_list;
417 struct journal_entry_pin *ret = NULL;
419 if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
424 fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
425 if (*seq > max_seq ||
426 (ret = list_first_entry_or_null(&pin_list->list,
427 struct journal_entry_pin, list)))
431 list_move(&ret->list, &pin_list->flushed);
432 BUG_ON(j->flush_in_progress);
433 j->flush_in_progress = ret;
434 j->last_flushed = jiffies;
437 spin_unlock(&j->lock);
442 /* returns true if we did work */
443 static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
446 struct journal_entry_pin *pin;
450 lockdep_assert_held(&j->reclaim_lock);
452 while ((pin = journal_get_next_pin(j, min_nr
453 ? U64_MAX : seq_to_flush, &seq))) {
457 pin->flush(j, pin, seq);
459 BUG_ON(j->flush_in_progress != pin);
460 j->flush_in_progress = NULL;
461 wake_up(&j->pin_flush_wait);
469 * bch2_journal_reclaim - free up journal buckets
471 * Background journal reclaim writes out btree nodes. It should be run
472 * early enough so that we never completely run out of journal buckets.
474 * High watermarks for triggering background reclaim:
475 * - FIFO has fewer than 512 entries left
476 * - fewer than 25% journal buckets free
478 * Background reclaim runs until low watermarks are reached:
479 * - FIFO has more than 1024 entries left
480 * - more than 50% journal buckets free
482 * As long as a reclaim can complete in the time it takes to fill up
483 * 512 journal entries or 25% of all journal buckets,
484 * journal_next_bucket() should not stall.
486 void bch2_journal_reclaim(struct journal *j)
488 struct bch_fs *c = container_of(j, struct bch_fs, journal);
490 unsigned iter, min_nr = 0;
491 u64 seq_to_flush = 0;
493 lockdep_assert_held(&j->reclaim_lock);
495 bch2_journal_do_discards(j);
499 for_each_rw_member(ca, c, iter) {
500 struct journal_device *ja = &ca->journal;
501 unsigned nr_buckets, bucket_to_flush;
506 /* Try to keep the journal at most half full: */
507 nr_buckets = ja->nr / 2;
509 /* And include pre-reservations: */
510 nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
511 (ca->mi.bucket_size << 6) -
512 journal_entry_overhead(j));
514 nr_buckets = min(nr_buckets, ja->nr);
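/*
 * For illustration (hypothetical numbers): with ja->nr == 20 we start from a
 * target of 10 buckets; if prereserved.reserved is 50000 u64s and each
 * 512-sector bucket holds roughly 512 * 64 u64s minus entry overhead, the
 * DIV_ROUND_UP above adds 2 more buckets, for a target of 12.
 */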
516 bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
517 seq_to_flush = max(seq_to_flush,
518 ja->bucket_seq[bucket_to_flush]);
521 /* Also flush if the pin fifo is more than half full */
522 seq_to_flush = max_t(s64, seq_to_flush,
523 (s64) journal_cur_seq(j) -
525 spin_unlock(&j->lock);
528 * If it's been longer than j->reclaim_delay_ms since we last flushed,
529 * make sure to flush at least one journal pin:
531 if (time_after(jiffies, j->last_flushed +
532 msecs_to_jiffies(j->reclaim_delay_ms)))
535 if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
536 seq_to_flush = max(seq_to_flush, journal_last_seq(j));
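/*
 * The check above kicks in when outstanding pre-reservations could consume
 * more than half of the space still available to back them: make sure we
 * flush at least up to the oldest pinned entry instead of waiting for the
 * normal watermarks.
 */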
540 journal_flush_pins(j, seq_to_flush, min_nr);
542 if (!bch2_journal_error(j))
543 queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
544 msecs_to_jiffies(j->reclaim_delay_ms));
547 void bch2_journal_reclaim_work(struct work_struct *work)
549 struct journal *j = container_of(to_delayed_work(work),
550 struct journal, reclaim_work);
552 mutex_lock(&j->reclaim_lock);
553 bch2_journal_reclaim(j);
554 mutex_unlock(&j->reclaim_lock);
557 static int journal_flush_done(struct journal *j, u64 seq_to_flush,
562 ret = bch2_journal_error(j);
566 mutex_lock(&j->reclaim_lock);
568 *did_work = journal_flush_pins(j, seq_to_flush, 0);
572 * If journal replay hasn't completed, the unreplayed journal entries
573 * hold refs on their corresponding sequence numbers
575 ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
576 journal_last_seq(j) > seq_to_flush ||
577 (fifo_used(&j->pin) == 1 &&
578 atomic_read(&fifo_peek_front(&j->pin).count) == 1);
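/*
 * We're done once everything up to seq_to_flush has been reclaimed, or once
 * the only remaining reference is the currently open journal entry's own.
 */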
580 spin_unlock(&j->lock);
581 mutex_unlock(&j->reclaim_lock);
586 bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
588 bool did_work = false;
590 if (!test_bit(JOURNAL_STARTED, &j->flags))
593 closure_wait_event(&j->async_wait,
594 journal_flush_done(j, seq_to_flush, &did_work));
599 int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
601 struct bch_fs *c = container_of(j, struct bch_fs, journal);
602 struct journal_entry_pin_list *p;
607 fifo_for_each_entry_ptr(p, &j->pin, iter)
609 ? bch2_dev_list_has_dev(p->devs, dev_idx)
610 : p->devs.nr < c->opts.metadata_replicas)
612 spin_unlock(&j->lock);
614 bch2_journal_flush_pins(j, seq);
616 ret = bch2_journal_error(j);
620 mutex_lock(&c->replicas_gc_lock);
621 bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
626 while (!ret && seq < j->pin.back) {
627 struct bch_replicas_padded replicas;
629 seq = max(seq, journal_last_seq(j));
630 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
631 journal_seq_pin(j, seq)->devs);
634 spin_unlock(&j->lock);
635 ret = bch2_mark_replicas(c, &replicas.e);
638 spin_unlock(&j->lock);
640 ret = bch2_replicas_gc_end(c, ret);
641 mutex_unlock(&c->replicas_gc_lock);