// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

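/*
 * Set the "remaining" field of j->prereserved (the budget available for
 * prereservations) with a cmpxchg loop, so a concurrently changing reserved
 * count isn't clobbered:
 */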
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

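/*
 * Compute journal space relative to the given watermark ("from"): the sectors
 * available for the next journal entry and the total sectors remaining,
 * taking the minimum across member devices with journal buckets. Returns
 * zeroes if fewer than nr_devs_want devices have any space.
 */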
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * We don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

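/*
 * Recalculate available journal space and cache it in the journal: advance
 * each device's dirty/dirty-ondisk indexes past buckets that are no longer
 * needed, note whether there's anything to discard, and update
 * j->cur_entry_sectors, j->cur_entry_error and the prereservation budget.
 * Must be called with j->lock held.
 */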
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6;
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

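/*
 * Returns true if this device has journal buckets that are clean on disk but
 * not yet discarded, i.e. bch2_journal_do_discards() has work to do:
 */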
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

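/*
 * Drop a reference on journal sequence number @seq; if it was the last
 * reference, pop now-unpinned entries off the front of the pin fifo.
 */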
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

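/*
 * Attach @pin to the pin list for @seq, dropping whatever it was pinning
 * before. Pins with a flush callback go on the list of pins still to be
 * flushed; pins without one go straight to the flushed list. Caller must
 * hold j->lock.
 */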
static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        __journal_pin_drop(j, pin);

        BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}

void __bch2_journal_pin_add(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        if (journal_pin_active(pin) && pin->seq < seq)
                return;

        spin_lock(&j->lock);

        if (pin->seq != seq) {
                bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                /*
                 * If the pin is already pinning the right sequence number, it
                 * still might've already been flushed:
                 */
                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

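/*
 * Make @dst pin the sequence number @src is pinning, if @src is active and
 * @dst isn't already pinning that sequence or an older one.
 */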
void bch2_journal_pin_copy(struct journal *j,
                           struct journal_entry_pin *dst,
                           struct journal_entry_pin *src,
                           journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src) &&
            (!journal_pin_active(dst) || src->seq < dst->seq))
                bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);

        spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

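/*
 * Find the oldest pin with seq <= @max_seq that still has a flush callback
 * pending, move it to its list's flushed list and mark it as the flush in
 * progress; *@seq is set to its sequence number. Returns NULL if reclaim
 * hasn't started yet or there's nothing left to flush.
 */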
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
                return NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

/* returns true if we did work */
static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        bool ret = false;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
                ret = true;
        }

        return ret;
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

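/*
 * Delayed work wrapper: takes j->reclaim_lock and runs bch2_journal_reclaim()
 * from the journal reclaim workqueue.
 */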
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

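/*
 * Wait condition for bch2_journal_flush_pins(): flushes another batch of pins
 * and returns nonzero when waiting can stop - on journal error, while journal
 * replay hasn't completed (unreplayed entries can't be flushed), once
 * everything up to @seq_to_flush has been unpinned, or when only the
 * currently open entry remains pinned.
 */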
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
                              bool *did_work)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        *did_work = journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        bool did_work = false;

        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return false;

        closure_wait_event(&j->async_wait,
                journal_flush_done(j, seq_to_flush, &did_work));

        return did_work;
}

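/*
 * Flush journal pins so that no dirty journal entry still references device
 * @dev_idx (or, if @dev_idx is negative, any entry replicated on fewer than
 * metadata_replicas devices), then update the replicas table for the entries
 * that remain. Used e.g. when migrating data off a device.
 */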
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}