// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
				   enum journal_space_from from)
{
	switch (from) {
	case journal_space_discarded:
		return ja->discard_idx;
	case journal_space_clean_ondisk:
		return ja->dirty_idx_ondisk;
	case journal_space_clean:
		return ja->dirty_idx;
	default:
		BUG();
	}
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
					    struct journal_device *ja,
					    enum journal_space_from from)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned available = (journal_space_from(ja, from) -
			      ja->cur_idx - 1 + ja->nr) % ja->nr;

	/*
	 * Allocator startup needs some journal space before we can do journal
	 * replay:
	 */
	if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
		--available;

	/*
	 * Don't use the last bucket unless writing the new last_seq
	 * will make another bucket available:
	 */
	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
		--available;

	return available;
}
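
/*
 * Worked example (hypothetical numbers): with ja->nr = 8, ja->cur_idx = 6 and
 * journal_space_from() returning 2, the calculation above gives
 * (2 - 6 - 1 + 8) % 8 = 3 buckets available between the current write
 * position and the oldest bucket still in use, treating the journal buckets
 * as a ring buffer.
 */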

static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
	union journal_preres_state old, new;
	u64 v = atomic64_read(&j->prereserved.counter);

	do {
		old.v = new.v = v;
		new.remaining = u64s_remaining;
	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
				       old.v, new.v)) != old.v);
}
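
/*
 * The loop above is the standard lockless read-modify-write pattern: snapshot
 * the counter, update only the `remaining` field, and retry the cmpxchg until
 * no concurrent writer (e.g. one adjusting `reserved`) raced with us.
 */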

static struct journal_space {
	unsigned	next_entry;
	unsigned	remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
			    enum journal_space_from from)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned sectors_next_entry	= UINT_MAX;
	unsigned sectors_total		= UINT_MAX;
	unsigned i, nr_devs = 0;
	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
		? journal_prev_buf(j)->sectors
		: 0;

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i,
				   &c->rw_devs[BCH_DATA_JOURNAL]) {
		struct journal_device *ja = &ca->journal;
		unsigned buckets_this_device, sectors_this_device;

		if (!ja->nr)
			continue;

		buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
		sectors_this_device = ja->sectors_free;

		/*
		 * Note that we don't allocate the space for a journal entry
		 * until we write it out - thus, account for it here:
		 */
		if (unwritten_sectors >= sectors_this_device) {
			if (!buckets_this_device)
				continue;

			buckets_this_device--;
			sectors_this_device = ca->mi.bucket_size;
		}

		sectors_this_device -= unwritten_sectors;

		if (sectors_this_device < ca->mi.bucket_size &&
		    buckets_this_device) {
			buckets_this_device--;
			sectors_this_device = ca->mi.bucket_size;
		}

		if (!sectors_this_device)
			continue;

		sectors_next_entry = min(sectors_next_entry,
					 sectors_this_device);

		sectors_total = min(sectors_total,
			buckets_this_device * ca->mi.bucket_size +
			sectors_this_device);

		nr_devs++;
	}
	rcu_read_unlock();

	if (nr_devs < nr_devs_want)
		return (struct journal_space) { 0, 0 };

	return (struct journal_space) {
		.next_entry	= sectors_next_entry,
		.remaining	= max_t(int, 0, sectors_total - sectors_next_entry),
	};
}
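
/*
 * The result is a conservative summary across devices: next_entry is the
 * most sectors the next journal entry can use (bounded by the device with the
 * least immediately-usable space), and remaining is the smallest total space
 * left after writing that entry, clamped to zero.
 */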

void bch2_journal_space_available(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_space discarded, clean_ondisk, clean;
	unsigned overhead, u64s_remaining = 0;
	unsigned max_entry_size	= min(j->buf[0].buf_size >> 9,
				      j->buf[1].buf_size >> 9);
	unsigned i, nr_online = 0, nr_devs_want;
	bool can_discard = false;
	int ret = 0;

	lockdep_assert_held(&j->lock);

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i,
				   &c->rw_devs[BCH_DATA_JOURNAL]) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

		while (ja->dirty_idx != ja->cur_idx &&
		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

		if (ja->discard_idx != ja->dirty_idx_ondisk)
			can_discard = true;

		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
		nr_online++;
	}
	rcu_read_unlock();

	j->can_discard = can_discard;

	if (nr_online < c->opts.metadata_replicas_required) {
		ret = -EROFS;
		goto out;
	}

	if (!fifo_free(&j->pin)) {
		ret = -ENOSPC;
		goto out;
	}

	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

	discarded	= __journal_space_available(j, nr_devs_want, journal_space_discarded);
	clean_ondisk	= __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
	clean		= __journal_space_available(j, nr_devs_want, journal_space_clean);

	if (!discarded.next_entry)
		ret = -ENOSPC;

	overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
		journal_entry_overhead(j);
	u64s_remaining = clean.remaining << 6;
	u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
out:
	j->cur_entry_sectors	= !ret ? discarded.next_entry : 0;
	j->cur_entry_error	= ret;
	journal_set_remaining(j, u64s_remaining);
	journal_check_may_get_unreserved(j);

	if (!ret)
		journal_wake(j);
}
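
/*
 * Unit conversion note: clean.remaining is in 512-byte sectors while the
 * prereserved counter is in u64s, so the shift by 6 above converts sectors to
 * u64s (512 / sizeof(u64) = 64 u64s per sector); the estimated per-entry
 * header overhead is then subtracted out.
 */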

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->discard_idx != ja->dirty_idx_ondisk;
	spin_unlock(&j->lock);

	return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned iter;

	mutex_lock(&j->discard_lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (should_discard_bucket(j, ja)) {
			if (ca->mi.discard &&
			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->discard_idx]),
					ca->mi.bucket_size, GFP_NOIO, 0);

			spin_lock(&j->lock);
			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

			bch2_journal_space_available(j);
			spin_unlock(&j->lock);
		}
	}

	mutex_unlock(&j->discard_lock);
}
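
/*
 * Note the locking above: discard_lock serializes the whole pass, while
 * j->lock is only taken to advance discard_idx and recompute the space
 * accounting - the blkdev_issue_discard() call itself, which can block,
 * happens outside the spinlock.
 */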

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */
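
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): a
 * btree node write that must be replayable until it lands on disk might do:
 *
 *	__bch2_journal_pin_add(j, seq, &w->journal_pin, btree_node_flush_fn);
 *	...
 *	bch2_journal_pin_drop(j, &w->journal_pin);   (once the write completes)
 *
 * where `w` and `btree_node_flush_fn` are placeholders for the caller's own
 * state and flush callback.
 */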

static void bch2_journal_reclaim_fast(struct journal *j)
{
	struct journal_entry_pin_list temp;
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!fifo_empty(&j->pin) &&
	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
		BUG_ON(!fifo_pop(&j->pin, temp));
		popped = true;
	}

	if (popped)
		bch2_journal_space_available(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	if (atomic_dec_and_test(&pin_list->count)) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}

static inline void __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list;

	if (!journal_pin_active(pin))
		return;

	pin_list = journal_seq_pin(j, pin->seq);
	pin->seq = 0;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	if (atomic_dec_and_test(&pin_list->count) &&
	    pin_list == &fifo_peek_front(&j->pin))
		bch2_journal_reclaim_fast(j);
	else if (fifo_used(&j->pin) == 1 &&
		 atomic_read(&pin_list->count) == 1)
		journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	__journal_pin_drop(j, pin);
	spin_unlock(&j->lock);
}

static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
					struct journal_entry_pin *pin,
					journal_pin_flush_fn flush_fn)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	__journal_pin_drop(j, pin);

	BUG_ON(!atomic_read(&pin_list->count));

	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;

	list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}

void __bch2_journal_pin_add(struct journal *j, u64 seq,
			    struct journal_entry_pin *pin,
			    journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);
	bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
	spin_unlock(&j->lock);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}

void bch2_journal_pin_copy(struct journal *j,
			   struct journal_entry_pin *dst,
			   struct journal_entry_pin *src,
			   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	if (journal_pin_active(src) &&
	    (!journal_pin_active(dst) || src->seq < dst->seq))
		bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);

	spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
	BUG_ON(journal_pin_active(pin));

	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */
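
/*
 * The flow below: journal_flush_pins() repeatedly asks journal_get_next_pin()
 * for the oldest pin at or below seq_to_flush, then invokes its flush
 * callback, which is expected to write out whatever is holding the pin and
 * eventually drop it.
 */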

static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;

	spin_lock(&j->lock);

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
		if (*seq > max_seq ||
		    (ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list)))
			break;

	if (ret) {
		list_move(&ret->list, &pin_list->flushed);
		BUG_ON(j->flush_in_progress);
		j->flush_in_progress = ret;
		j->last_flushed = jiffies;
	}

	spin_unlock(&j->lock);

	return ret;
}

static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
			       unsigned min_nr)
{
	struct journal_entry_pin *pin;
	u64 seq;

	lockdep_assert_held(&j->reclaim_lock);

	while ((pin = journal_get_next_pin(j, min_nr
				? U64_MAX : seq_to_flush, &seq))) {
		if (min_nr)
			min_nr--;

		pin->flush(j, pin, seq);

		BUG_ON(j->flush_in_progress != pin);
		j->flush_in_progress = NULL;
		wake_up(&j->pin_flush_wait);
	}
}
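
/*
 * Note the min_nr handling above: a nonzero min_nr overrides seq_to_flush
 * (pins are taken up to U64_MAX) until min_nr pins have been flushed, which
 * guarantees forward progress even when nothing is older than seq_to_flush.
 */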

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned iter, min_nr = 0;
	u64 seq_to_flush = 0;

	lockdep_assert_held(&j->reclaim_lock);

	bch2_journal_do_discards(j);

	spin_lock(&j->lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned nr_buckets, bucket_to_flush;

		if (!ja->nr)
			continue;

		/* Try to keep the journal at most half full: */
		nr_buckets = ja->nr / 2;

		/* And include pre-reservations: */
		nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
					   (ca->mi.bucket_size << 6) -
					   journal_entry_overhead(j));
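
		/*
		 * bucket_size is in 512-byte sectors, so bucket_size << 6 is
		 * the bucket's capacity in u64s; subtracting the per-entry
		 * overhead gives usable u64s per bucket. E.g. (hypothetical
		 * numbers) a 256KiB bucket is 512 sectors = 32768 u64s.
		 */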

		nr_buckets = min(nr_buckets, ja->nr);

		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
		seq_to_flush = max(seq_to_flush,
				   ja->bucket_seq[bucket_to_flush]);
	}

	/* Also flush if the pin fifo is more than half full */
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	/*
	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
	 * make sure to flush at least one journal pin:
	 */
	if (time_after(jiffies, j->last_flushed +
		       msecs_to_jiffies(j->reclaim_delay_ms)))
		min_nr = 1;

	if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
		seq_to_flush = max(seq_to_flush, journal_last_seq(j));
		min_nr = 1;
	}

	journal_flush_pins(j, seq_to_flush, min_nr);

	if (!bch2_journal_error(j))
		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
				   msecs_to_jiffies(j->reclaim_delay_ms));
}

void bch2_journal_reclaim_work(struct work_struct *work)
{
	struct journal *j = container_of(to_delayed_work(work),
				struct journal, reclaim_work);

	mutex_lock(&j->reclaim_lock);
	bch2_journal_reclaim(j);
	mutex_unlock(&j->reclaim_lock);
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
	int ret;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&j->reclaim_lock);

	journal_flush_pins(j, seq_to_flush, 0);

	spin_lock(&j->lock);

	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		(fifo_used(&j->pin) == 1 &&
		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

	spin_unlock(&j->lock);
	mutex_unlock(&j->reclaim_lock);

	return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return;

	closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);
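
	/*
	 * seq is now the newest entry that still references dev_idx (or, when
	 * dev_idx < 0, that has fewer than the wanted number of replicas);
	 * flushing everything up to it forces those references to be dropped.
	 */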
	bch2_journal_flush_pins(j, seq);

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

	seq = 0;

	spin_lock(&j->lock);
	while (!ret && seq < j->pin.back) {
		struct bch_replicas_padded replicas;

		seq = max(seq, journal_last_seq(j));
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
					 journal_seq_pin(j, seq)->devs);
		seq++;

		spin_unlock(&j->lock);
		ret = bch2_mark_replicas(c, &replicas.e);
		spin_lock(&j->lock);
	}
	spin_unlock(&j->lock);

	ret = bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}