#include "bcachefs.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

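/*
 * Illustrative usage sketch: a subsystem that journals an update pins the
 * sequence number the update went into, and passes a flush_fn that writes out
 * the dirty state and then drops the pin.  The names my_dirty_obj and
 * my_obj_flush are hypothetical; only bch2_journal_pin_add(),
 * bch2_journal_pin_drop() and the journal_pin_flush_fn calling convention
 * come from this file:
 *
 *	struct my_dirty_obj {
 *		struct journal_entry_pin journal_pin;
 *	};
 *
 *	static void my_obj_flush(struct journal *j,
 *				 struct journal_entry_pin *pin, u64 seq)
 *	{
 *		struct my_dirty_obj *obj =
 *			container_of(pin, struct my_dirty_obj, journal_pin);
 *
 *		... write out whatever made journal entry @seq dirty, then:
 *		bch2_journal_pin_drop(j, &obj->journal_pin);
 *	}
 *
 *	... and after journalling an update to @obj in the entry with sequence
 *	number @seq:
 *	bch2_journal_pin_add(j, seq, &obj->journal_pin, my_obj_flush);
 */
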
static inline u64 journal_pin_seq(struct journal *j,
				  struct journal_entry_pin_list *pin_list)
{
	return fifo_entry_idx_abs(&j->pin, pin_list);
}

u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
{
	u64 ret = 0;

	spin_lock(&j->lock);
	if (journal_pin_active(pin))
		ret = journal_pin_seq(j, pin->pin_list);
	spin_unlock(&j->lock);

	return ret;
}

static inline void __journal_pin_add(struct journal *j,
				     struct journal_entry_pin_list *pin_list,
				     struct journal_entry_pin *pin,
				     journal_pin_flush_fn flush_fn)
{
	BUG_ON(journal_pin_active(pin));
	BUG_ON(!atomic_read(&pin_list->count));

	atomic_inc(&pin_list->count);
	pin->pin_list	= pin_list;
	pin->flush	= flush_fn;

	if (flush_fn)
		list_add(&pin->list, &pin_list->list);
	else
		INIT_LIST_HEAD(&pin->list);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);
	__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
	spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list = pin->pin_list;

	if (!journal_pin_active(pin))
		return;

	pin->pin_list = NULL;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	if (atomic_dec_and_test(&pin_list->count) &&
	    pin_list == &fifo_peek_front(&j->pin))
		bch2_journal_reclaim_fast(j);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	__journal_pin_drop(j, pin);
	spin_unlock(&j->lock);
}

void bch2_journal_pin_add_if_older(struct journal *j,
				   struct journal_entry_pin *src_pin,
				   struct journal_entry_pin *pin,
				   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	if (journal_pin_active(src_pin) &&
	    (!journal_pin_active(pin) ||
	     journal_pin_seq(j, src_pin->pin_list) <
	     journal_pin_seq(j, pin->pin_list))) {
		__journal_pin_drop(j, pin);
		__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
	}

	spin_unlock(&j->lock);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

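/*
 * Those cases map onto the entry points below: background reclaim is driven
 * by bch2_journal_reclaim_work(), flushing everything (e.g. for a clean
 * shutdown) is bch2_journal_flush_all_pins(), and evacuating a single device
 * is bch2_journal_flush_device_pins().
 */
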
/**
 * bch2_journal_reclaim_fast - do the fast part of journal reclaim
 *
 * Called from IO submission context, does not block. Cleans up after btree
 * write completions by advancing the journal pin and each device's last_idx,
 * kicking off discards and background reclaim as necessary.
 */
void bch2_journal_reclaim_fast(struct journal *j)
{
	struct journal_entry_pin_list temp;
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
		BUG_ON(!fifo_pop(&j->pin, temp));
		popped = true;
	}

	if (popped)
		journal_wake(j);
}

static struct journal_entry_pin *
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret;
	u64 iter;

	/* no need to iterate over empty fifo entries: */
	bch2_journal_reclaim_fast(j);

	fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
		if (iter > seq_to_flush)
			break;

		ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list);
		if (ret) {
			/* must be list_del_init(), see bch2_journal_pin_drop() */
			list_move(&ret->list, &pin_list->flushed);
			*seq = iter;
			return ret;
		}
	}

	return NULL;
}

static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
	struct journal_entry_pin *ret;

	spin_lock(&j->lock);
	ret = __journal_get_next_pin(j, seq_to_flush, seq);
	spin_unlock(&j->lock);

	return ret;
}

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->nr &&
	      (ja->last_idx != ja->cur_idx &&
	       ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
	spin_unlock(&j->lock);

	return ret;
}

/**
 * bch2_journal_reclaim_work - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
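/*
 * Worked example of the 50% bucket target computed below, with made-up
 * numbers: if a device has ja->nr == 8 journal buckets and ja->cur_idx == 2,
 * then bucket_to_flush == (2 + 8/2) % 8 == 6 and seq_to_flush is raised to
 * ja->bucket_seq[6].  Once every pin up to that sequence number has been
 * flushed, last_seq can advance past everything in the older half of the
 * buckets (3 through 6 here), and should_discard_bucket() then allows them
 * to be discarded and reused.
 */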
void bch2_journal_reclaim_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(to_delayed_work(work),
				struct bch_fs, journal.reclaim_work);
	struct journal *j = &c->journal;
	struct bch_dev *ca;
	struct journal_entry_pin *pin;
	u64 seq, seq_to_flush = 0;
	unsigned iter, bucket_to_flush;
	unsigned long next_flush;
	bool reclaim_lock_held = false, need_flush;

	/*
	 * Advance last_idx to point to the oldest journal entry containing
	 * btree node updates that have not yet been written out
	 */
	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

		while (should_discard_bucket(j, ja)) {
			if (!reclaim_lock_held) {
				/*
				 * might be called from __journal_res_get()
				 * under wait_event() - have to go back to
				 * TASK_RUNNING before doing something that
				 * would block, but only if we're doing work:
				 */
				__set_current_state(TASK_RUNNING);

				mutex_lock(&j->reclaim_lock);
				reclaim_lock_held = true;
				/* recheck under reclaim_lock: */
				continue;
			}

			if (ca->mi.discard &&
			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->last_idx]),
					ca->mi.bucket_size, GFP_NOIO, 0);

			spin_lock(&j->lock);
			ja->last_idx = (ja->last_idx + 1) % ja->nr;
			spin_unlock(&j->lock);

			journal_wake(j);
		}

		/*
		 * Write out enough btree nodes to free up 50% of the journal
		 * buckets
		 */
		spin_lock(&j->lock);
		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
		seq_to_flush = max_t(u64, seq_to_flush,
				     ja->bucket_seq[bucket_to_flush]);
		spin_unlock(&j->lock);
	}

	if (reclaim_lock_held)
		mutex_unlock(&j->reclaim_lock);

	/* Also flush if the pin fifo is more than half full */
	spin_lock(&j->lock);
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	/*
	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
	 * make sure to flush at least one journal pin:
	 */
	next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
	need_flush = time_after(jiffies, next_flush);

	while ((pin = journal_get_next_pin(j, need_flush
					   ? U64_MAX
					   : seq_to_flush, &seq))) {
		__set_current_state(TASK_RUNNING);
		pin->flush(j, pin, seq);
		need_flush = false;

		j->last_flushed = jiffies;
	}

	if (!test_bit(BCH_FS_RO, &c->flags))
		queue_delayed_work(system_freezable_wq, &j->reclaim_work,
				   msecs_to_jiffies(j->reclaim_delay_ms));
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      struct journal_entry_pin **pin,
			      u64 *pin_seq)
{
	int ret;

	*pin = NULL;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
		!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		(fifo_used(&j->pin) == 1 &&
		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);
	spin_unlock(&j->lock);

	return ret;
}

int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin *pin;
	u64 pin_seq;
	bool flush;

	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return 0;
again:
	wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
	if (pin) {
		/* flushing a journal pin might cause a new one to be added: */
		pin->flush(j, pin, pin_seq);
		goto again;
	}

	spin_lock(&j->lock);
	flush = journal_last_seq(j) != j->last_seq_ondisk ||
		(seq_to_flush == U64_MAX && c->btree_roots_dirty);
	spin_unlock(&j->lock);

	return flush ? bch2_journal_meta(j) : 0;
}

int bch2_journal_flush_all_pins(struct journal *j)
{
	return bch2_journal_flush_pins(j, U64_MAX);
}

int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	struct bch_devs_list devs;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	ret = bch2_journal_flush_pins(j, seq);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

	seq = 0;

	spin_lock(&j->lock);
	while (!ret && seq < j->pin.back) {
		seq = max(seq, journal_last_seq(j));
		devs = journal_seq_pin(j, seq)->devs;
		seq++;

		spin_unlock(&j->lock);
		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
		spin_lock(&j->lock);
	}
	spin_unlock(&j->lock);

	bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}