// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
/* Free space calculations: */
unsigned bch2_journal_dev_buckets_available(struct journal *j,
					    struct journal_device *ja)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned next = (ja->cur_idx + 1) % ja->nr;
	unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;

	/*
	 * Allocator startup needs some journal space before we can do journal
	 * replay:
	 */
	if (available &&
	    test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
		--available;

	/*
	 * Don't use the last bucket unless writing the new last_seq
	 * will make another bucket available:
	 */
	if (available &&
	    journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
		--available;

	return available;
}
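/*
 * A worked example of the ring arithmetic above, with illustrative values
 * (not from the source): if ja->nr == 8, ja->cur_idx == 2 and
 * ja->last_idx == 6, then next == 3 and available == (6 + 8 - 3) % 8 == 3;
 * buckets 3, 4 and 5 may still be written to, while the buckets from
 * last_idx up through cur_idx hold entries that are still dirty.
 */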
void bch2_journal_space_available(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned sectors_next_entry	= UINT_MAX;
	unsigned sectors_total		= UINT_MAX;
	unsigned max_entry_size		= min(j->buf[0].buf_size >> 9,
					      j->buf[1].buf_size >> 9);
	unsigned i, nr_online = 0, nr_devs = 0;
	unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
		? journal_prev_buf(j)->sectors
		: 0;
	int ret = 0;

	lockdep_assert_held(&j->lock);

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i,
				   &c->rw_devs[BCH_DATA_JOURNAL]) {
		struct journal_device *ja = &ca->journal;
		unsigned buckets_this_device, sectors_this_device;

		if (!ja->nr)
			continue;

		nr_online++;

		buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
		sectors_this_device = ja->sectors_free;

		/*
		 * Note that we don't allocate the space for a journal entry
		 * until we write it out - thus, account for it here:
		 */
		if (unwritten_sectors >= sectors_this_device) {
			if (!buckets_this_device)
				continue;

			buckets_this_device--;
			sectors_this_device = ca->mi.bucket_size;
		}

		sectors_this_device -= unwritten_sectors;

		if (sectors_this_device < ca->mi.bucket_size &&
		    buckets_this_device) {
			buckets_this_device--;
			sectors_this_device = ca->mi.bucket_size;
		}

		if (!sectors_this_device)
			continue;

		sectors_next_entry = min(sectors_next_entry,
					 sectors_this_device);

		sectors_total = min(sectors_total,
			buckets_this_device * ca->mi.bucket_size +
			sectors_this_device);

		max_entry_size = min_t(unsigned, max_entry_size,
				       ca->mi.bucket_size);

		nr_devs++;
	}
	rcu_read_unlock();

	if (nr_online < c->opts.metadata_replicas_required) {
		ret = -EROFS;
		sectors_next_entry = 0;
	} else if (!sectors_next_entry ||
		   nr_devs < min_t(unsigned, nr_online,
				   c->opts.metadata_replicas)) {
		ret = -ENOSPC;
		sectors_next_entry = 0;
	} else if (!fifo_free(&j->pin)) {
		ret = -ENOSPC;
		sectors_next_entry = 0;
	}

	j->cur_entry_sectors	= sectors_next_entry;
	j->cur_entry_error	= ret;

	if (!ret)
		journal_wake(j);
}
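/*
 * The values computed above are consumed by the journal entry open path
 * (journal_entry_open() in journal.c, in this tree): j->cur_entry_sectors
 * bounds the size of the next entry, and a nonzero j->cur_entry_error makes
 * new journal reservations fail until reclaim frees up space and this
 * function is rerun.
 */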
/* Discards - last part of journal reclaim: */
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->nr &&
		ja->last_idx != ja->cur_idx &&
		ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
	spin_unlock(&j->lock);

	return ret;
}
/*
 * Advance ja->last_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
static void journal_do_discards(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned iter;

	mutex_lock(&j->reclaim_lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (should_discard_bucket(j, ja)) {
			if (ca->mi.discard &&
			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->last_idx]),
					ca->mi.bucket_size, GFP_NOIO, 0);

			spin_lock(&j->lock);
			ja->last_idx = (ja->last_idx + 1) % ja->nr;

			bch2_journal_space_available(j);
			spin_unlock(&j->lock);
		}
	}

	mutex_unlock(&j->reclaim_lock);
}
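/*
 * Note the ordering above: the discard (when supported) is issued before
 * ja->last_idx is advanced under j->lock, so a bucket is only reported as
 * available by bch2_journal_space_available() once it has been discarded.
 */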
/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */
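/*
 * Typical usage (a hypothetical sketch, not code from this file): a btree
 * update takes a pin on the journal entry its keys went into, and the pin's
 * flush callback writes out the btree node so the entry can be reclaimed:
 *
 *	static void btree_node_flush(struct journal *j,
 *				     struct journal_entry_pin *pin, u64 seq)
 *	{
 *		// write out the btree node holding the unflushed keys
 *	}
 *
 *	bch2_journal_pin_add(j, seq, &w->journal, btree_node_flush);
 *
 * (btree_node_flush and w->journal are illustrative names.) Dropping the pin
 * with bch2_journal_pin_drop() then allows last_seq to advance past seq once
 * all older pins are gone.
 */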
static void bch2_journal_reclaim_fast(struct journal *j)
{
	struct journal_entry_pin_list temp;
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!fifo_empty(&j->pin) &&
	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
		BUG_ON(!fifo_pop(&j->pin, temp));
		popped = true;
	}

	if (popped)
		bch2_journal_space_available(j);
}
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	if (atomic_dec_and_test(&pin_list->count)) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}
static inline void __journal_pin_add(struct journal *j,
				     u64 seq,
				     struct journal_entry_pin *pin,
				     journal_pin_flush_fn flush_fn)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	BUG_ON(journal_pin_active(pin));
	BUG_ON(!atomic_read(&pin_list->count));

	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;

	list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}
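/*
 * Pins added with a NULL flush_fn go straight onto the ->flushed list above:
 * they still hold the journal entry dirty, but there is no work for reclaim
 * to do on them, so journal_get_next_pin() (below) only walks ->list.
 */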
void bch2_journal_pin_add(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);
	__journal_pin_add(j, seq, pin, flush_fn);
	spin_unlock(&j->lock);
}
static inline void __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list;

	if (!journal_pin_active(pin))
		return;

	pin_list = journal_seq_pin(j, pin->seq);

	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	if (atomic_dec_and_test(&pin_list->count) &&
	    pin_list == &fifo_peek_front(&j->pin))
		bch2_journal_reclaim_fast(j);
	else if (fifo_used(&j->pin) == 1 &&
		 atomic_read(&pin_list->count) == 1)
		journal_wake(j);
}
void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	__journal_pin_drop(j, pin);
	spin_unlock(&j->lock);
}
void bch2_journal_pin_update(struct journal *j, u64 seq,
			     struct journal_entry_pin *pin,
			     journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	if (pin->seq != seq) {
		__journal_pin_drop(j, pin);
		__journal_pin_add(j, seq, pin, flush_fn);
	} else {
		struct journal_entry_pin_list *pin_list =
			journal_seq_pin(j, seq);

		list_move(&pin->list, &pin_list->list);
	}

	spin_unlock(&j->lock);
}
void bch2_journal_pin_add_if_older(struct journal *j,
				   struct journal_entry_pin *src_pin,
				   struct journal_entry_pin *pin,
				   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	if (journal_pin_active(src_pin) &&
	    (!journal_pin_active(pin) ||
	     src_pin->seq < pin->seq)) {
		__journal_pin_drop(j, pin);
		__journal_pin_add(j, src_pin->seq, pin, flush_fn);
	}

	spin_unlock(&j->lock);
}
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
	BUG_ON(journal_pin_active(pin));

	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}
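/*
 * The wait above works because journal_flush_pins() (below) sets
 * j->flush_in_progress via journal_get_next_pin() before calling the flush
 * callback, then clears it and wakes j->pin_flush_wait when the callback
 * returns -- so once bch2_journal_pin_flush() returns, pin's callback is
 * guaranteed not to be running.
 */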
/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;

	spin_lock(&j->lock);

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
		if (*seq > max_seq ||
		    (ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list)))
			break;

	if (ret) {
		list_move(&ret->list, &pin_list->flushed);
		BUG_ON(j->flush_in_progress);
		j->flush_in_progress = ret;
		j->last_flushed = jiffies;
	}

	spin_unlock(&j->lock);

	return ret;
}
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
			       unsigned min_nr)
{
	struct journal_entry_pin *pin;
	u64 seq;

	lockdep_assert_held(&j->reclaim_lock);

	while ((pin = journal_get_next_pin(j, min_nr
			? U64_MAX : seq_to_flush, &seq))) {
		if (min_nr)
			min_nr--;

		pin->flush(j, pin, seq);

		BUG_ON(j->flush_in_progress != pin);
		j->flush_in_progress = NULL;
		wake_up(&j->pin_flush_wait);
	}
}
/**
 * bch2_journal_reclaim_work - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(to_delayed_work(work),
				struct bch_fs, journal.reclaim_work);
	struct journal *j = &c->journal;
	struct bch_dev *ca;
	unsigned iter, bucket_to_flush, min_nr = 0;
	u64 seq_to_flush = 0;

	journal_do_discards(j);

	mutex_lock(&j->reclaim_lock);
	spin_lock(&j->lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

		/* Try to keep the journal at most half full: */
		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
		seq_to_flush = max_t(u64, seq_to_flush,
				     ja->bucket_seq[bucket_to_flush]);
	}

	/* Also flush if the pin fifo is more than half full */
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	/*
	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
	 * make sure to flush at least one journal pin:
	 */
	if (time_after(jiffies, j->last_flushed +
		       msecs_to_jiffies(j->reclaim_delay_ms)))
		min_nr = 1;

	journal_flush_pins(j, seq_to_flush, min_nr);

	mutex_unlock(&j->reclaim_lock);

	if (!test_bit(BCH_FS_RO, &c->flags))
		queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
				   msecs_to_jiffies(j->reclaim_delay_ms));
}
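/*
 * A worked example of the half-full target above (illustrative numbers): on
 * a device with ja->nr == 16 buckets and ja->cur_idx == 3, bucket_to_flush
 * is (3 + 8) % 16 == 11, so reclaim flushes pins up to the sequence number
 * last written to bucket 11 -- enough to leave roughly half the buckets
 * clean once the corresponding btree nodes are written out.
 */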
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
	int ret;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&j->reclaim_lock);

	journal_flush_pins(j, seq_to_flush, 0);

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		(fifo_used(&j->pin) == 1 &&
		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

	spin_unlock(&j->lock);
	mutex_unlock(&j->reclaim_lock);

	return ret;
}
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return;

	closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	bch2_journal_flush_pins(j, seq);

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

	seq = 0;

	spin_lock(&j->lock);
	while (!ret && seq < j->pin.back) {
		struct bch_replicas_padded replicas;

		seq = max(seq, journal_last_seq(j));
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
					 journal_seq_pin(j, seq)->devs);
		seq++;

		spin_unlock(&j->lock);
		ret = bch2_mark_replicas(c, &replicas.e);
		spin_lock(&j->lock);
	}
	spin_unlock(&j->lock);

	ret = bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}
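/*
 * Callers of bch2_journal_flush_device_pins() (the device removal/evacuation
 * paths, in this tree) use it to ensure no journal entry still references
 * dev_idx before the device goes away; passing a negative dev_idx instead
 * flushes any entries that are below the configured metadata replication
 * level, so their replica lists can be re-marked above.
 */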