// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                --available;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

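/*
 * Update the amount of prereserved journal space (in u64s) we advertise as
 * remaining, using a cmpxchg loop so the update is atomic with respect to
 * concurrent prereservations:
 */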
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

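/*
 * Compute free journal space for a given accounting level (@from): the number
 * of sectors available for the next journal entry, and how many sectors remain
 * beyond that, taking the minimum across all rw journal devices. Returns zero
 * if fewer than @nr_devs_want devices have space available.
 */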
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * Note that we don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

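/*
 * Recalculate the journal's cached free space: advance each device's dirty
 * and on-disk-dirty bucket indices past entries that are no longer pinned,
 * then update j->cur_entry_sectors, j->cur_entry_error and the prereserved
 * space accordingly. Must be called with j->lock held.
 */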
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6;
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

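/*
 * Drop an active pin: remove it from its pin list and release that list's
 * reference, potentially freeing journal space. Caller must hold j->lock.
 */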
static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

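/*
 * Re-point @pin at journal sequence number @seq: drop any existing pin, take
 * a reference on @seq's pin list, and add @pin to that list (or to the flushed
 * list if no flush callback was given). Caller must hold j->lock.
 */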
static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        __journal_pin_drop(j, pin);

        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}

void __bch2_journal_pin_add(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

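/*
 * Make @dst pin the same journal entry that @src pins, if @src is active and
 * @dst isn't already pinning an older or equal sequence number:
 */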
void bch2_journal_pin_copy(struct journal *j,
                           struct journal_entry_pin *dst,
                           struct journal_entry_pin *src,
                           journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src) &&
            (!journal_pin_active(dst) || src->seq < dst->seq))
                bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);

        spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal.
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

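/*
 * Find the oldest journal entry with sequence number <= @max_seq that still
 * has an unflushed pin; if one is found, move it to the flushed list, mark it
 * as the flush in progress, and return it with *seq set to its sequence
 * number.
 */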
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

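/*
 * Flush pins up to @seq_to_flush by calling their flush callbacks; if @min_nr
 * is nonzero, flush at least that many pins regardless of their sequence
 * number. Caller must hold j->reclaim_lock.
 */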
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

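/*
 * Workqueue entry point for background reclaim: takes j->reclaim_lock and
 * runs bch2_journal_reclaim().
 */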
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

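/*
 * Condition for the closure_wait_event() in bch2_journal_flush_pins(): flush
 * what we can and return nonzero once everything up to @seq_to_flush has been
 * flushed, can't be flushed until journal replay finishes, or the journal has
 * hit an error.
 */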
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

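/*
 * Flush all journal pins referencing device @dev_idx (or, if @dev_idx is
 * negative, all entries with fewer than metadata_replicas copies), then
 * re-mark the journal replicas entries that remain:
 */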
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}