/* libbcachefs/journal_reclaim.c */

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned next = (ja->cur_idx + 1) % ja->nr;
        unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available &&
            test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                available--;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available &&
            journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
                --available;

        return available;
}
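
/*
 * Worked example (illustrative only, not part of the original source): with
 * ja->nr = 8, ja->cur_idx = 5 and ja->last_idx = 2, we get next = 6 and
 * available = (2 + 8 - 6) % 8 = 4, i.e. buckets 6, 7, 0 and 1. Each of the
 * two conditional decrements above may then reserve one of those, leaving
 * 2, 3 or 4 buckets actually reported as available.
 */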

void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned max_entry_size         = min(j->buf[0].buf_size >> 9,
                                              j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                nr_online++;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
                sectors_this_device = ja->sectors_free;

                /*
                 * Note that we don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                max_entry_size = min_t(unsigned, max_entry_size,
                                       ca->mi.bucket_size);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                sectors_next_entry = 0;
        } else if (!sectors_next_entry ||
                   nr_devs < min_t(unsigned, nr_online,
                                   c->opts.metadata_replicas)) {
                ret = -ENOSPC;
                sectors_next_entry = 0;
        } else if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                sectors_next_entry = 0;
        }

        j->cur_entry_sectors    = sectors_next_entry;
        j->cur_entry_error      = ret;

        if (!ret)
                journal_wake(j);
}
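
/*
 * Illustrative trace of the per-device accounting above (numbers are made up,
 * not from the original source): on a device with 256-sector buckets,
 * ja->sectors_free = 100, 3 available buckets and 40 unwritten sectors, the
 * current bucket is left with 60 sectors; that is less than a full bucket, so
 * one bucket is consumed and this device contributes
 * sectors_next_entry = 256 and sectors_total = 2 * 256 + 256 = 768.
 */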

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->nr &&
                ja->last_idx != ja->cur_idx &&
                ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->last_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
static void journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->reclaim_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->last_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->last_idx = (ja->last_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->reclaim_lock);
}
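
/*
 * Illustrative sketch (not part of the original source): with ja->nr = 8,
 * ja->last_idx = 2, ja->cur_idx = 5 and bucket_seq[2..4] all older than
 * j->last_seq_ondisk, the loop above discards buckets 2, 3 and 4 (when the
 * device and its queue support discard) and leaves ja->last_idx == 5,
 * recalculating the available journal space after each step.
 */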

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

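/*
 * Usage sketch (illustrative only; the callback shape follows the
 * pin->flush(j, pin, seq) call in journal_flush_pins() below, and
 * my_object/my_object_flush are hypothetical caller-side names):
 *
 *      static void my_object_flush(struct journal *j,
 *                                  struct journal_entry_pin *pin, u64 seq)
 *      {
 *              // write out whatever dirty state the pin protects, then:
 *              bch2_journal_pin_drop(j, pin);
 *      }
 *
 *      bch2_journal_pin_add(j, seq, &my_object->journal_pin, my_object_flush);
 *
 * where @seq is the sequence number of a still-open journal entry (e.g. the
 * one the caller's update went into). The pin keeps last_seq from advancing
 * past @seq until the flush callback runs or the caller drops the pin.
 */
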
static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

static inline void __journal_pin_add(struct journal *j,
                                     u64 seq,
                                     struct journal_entry_pin *pin,
                                     journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        BUG_ON(journal_pin_active(pin));
        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        __journal_pin_add(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (pin->seq != seq) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);
}

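/*
 * Intended-use note (a hedged summary, not from the original source): when an
 * object takes over dirty state that was protected by @src_pin (e.g. keys
 * copied from one in-memory structure to another), this makes sure @pin pins
 * a sequence number no newer than @src_pin's, so the journal entries covering
 * that state still cannot be reclaimed until the new owner gets flushed.
 */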
void bch2_journal_pin_add_if_older(struct journal *j,
                                  struct journal_entry_pin *src_pin,
                                  struct journal_entry_pin *pin,
                                  journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src_pin) &&
            (!journal_pin_active(pin) ||
             src_pin->seq < pin->seq)) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, src_pin->seq, pin, flush_fn);
        }

        spin_unlock(&j->lock);
}

void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

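/*
 * Rough call flow (a descriptive summary of the code below, not part of the
 * original source):
 *
 *      bch2_journal_reclaim_work()
 *        -> journal_do_discards()
 *        -> journal_flush_pins(j, seq_to_flush, min_nr)
 *             -> journal_get_next_pin()   picks the oldest unflushed pin
 *             -> pin->flush(j, pin, seq)  writes out the pinned state
 *
 * bch2_journal_flush_pins() and bch2_journal_flush_device_pins() drive the
 * same journal_flush_pins() machinery synchronously.
 */
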
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim_work - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(to_delayed_work(work),
                                struct bch_fs, journal.reclaim_work);
        struct journal *j = &c->journal;
        struct bch_dev *ca;
        unsigned iter, bucket_to_flush, min_nr = 0;
        u64 seq_to_flush = 0;

        journal_do_discards(j);

        mutex_lock(&j->reclaim_lock);
        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
                seq_to_flush = max_t(u64, seq_to_flush,
                                     ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        journal_flush_pins(j, seq_to_flush, min_nr);

        mutex_unlock(&j->reclaim_lock);

        if (!test_bit(BCH_FS_RO, &c->flags))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}
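
/*
 * Worked example (illustrative only, not from the original source): with
 * ja->nr = 8 and ja->cur_idx = 6, bucket_to_flush = (6 + 4) % 8 = 2, so
 * reclaim targets everything still pinning bucket_seq[2]; and with
 * j->pin.size = 1024 and journal_cur_seq(j) = 10000, the fifo bound raises
 * seq_to_flush to at least 10000 - 512, keeping both the on-disk journal
 * and the in-memory pin fifo at most roughly half full.
 */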

static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

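/*
 * Summary (added note, not from the original source): with dev_idx >= 0 this
 * flushes every journal entry that still references the given device (as when
 * migrating data off of it), and with a negative dev_idx it flushes entries
 * whose journal writes are under-replicated; afterwards it re-marks replicas
 * entries for whatever journal sequence numbers remain pinned.
 */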
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}