#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

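/*
 * Map an enum journal_space_from to the corresponding per-device bucket index:
 * space can be measured from the last discarded bucket, the last bucket clean
 * on disk, or the last clean bucket.
 */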
static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

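/*
 * Number of buckets on @ja currently available for journal writes, measured
 * from the given reclaim stage (discarded/clean-ondisk/clean):
 */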
unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                --available;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

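/*
 * Publish how many u64s are still available for journal prereservations,
 * using a cmpxchg loop so the update is atomic with respect to concurrent
 * prereservation gets/puts:
 */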
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

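/*
 * Compute, for the given reclaim stage, how many sectors the next journal
 * entry may use (next_entry) and how much space remains beyond that
 * (remaining), across the journal devices; returns zeroes if fewer than
 * @nr_devs_want devices have any space:
 */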
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * We don't allocate the space for a journal entry until we
                 * write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

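/*
 * Recalculate available journal space and update j->cur_entry_sectors,
 * j->cur_entry_error and the prereserved space accordingly; called with
 * j->lock held whenever the amount of free journal space may have changed:
 */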
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6; /* 512 byte sectors -> u64s */
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

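/* True if @ja has buckets that are clean on disk but not yet discarded: */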
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

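/*
 * Illustrative sketch of how this API is typically used (the names below are
 * hypothetical, not from this file): code writing a btree node takes a pin on
 * the journal entry its keys went into, and the flush callback - invoked by
 * reclaim as pin->flush(j, pin, seq), assumed here to return void - writes the
 * node out and drops the pin:
 *
 *	static void btree_node_flush(struct journal *j,
 *				     struct journal_entry_pin *pin, u64 seq)
 *	{
 *		// write out the btree node holding this pin, then:
 *		bch2_journal_pin_drop(j, pin);
 *	}
 *
 *	bch2_journal_pin_add(j, seq, &my_pin, btree_node_flush);
 */
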
static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

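/*
 * Drop a reference on journal sequence number @seq; if it was the last one,
 * entries at the front of the pin FIFO may now be reclaimed:
 */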
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

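/*
 * Add @pin to the pin list for @seq, taking a reference on that journal entry;
 * called with j->lock held, and the pin list for @seq must still have a
 * nonzero reference count:
 */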
static inline void __journal_pin_add(struct journal *j,
                                     u64 seq,
                                     struct journal_entry_pin *pin,
                                     journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        BUG_ON(journal_pin_active(pin));
        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        __journal_pin_add(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

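/*
 * Re-pin at @seq: drop and re-add the pin if its sequence number changed,
 * otherwise move it back onto the list of pins still to be flushed:
 */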
void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (pin->seq != seq) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);
}

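/*
 * Pin @pin at @src_pin's sequence number, but only if doing so would pin an
 * older (smaller) sequence number than @pin currently holds:
 */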
void bch2_journal_pin_add_if_older(struct journal *j,
                                   struct journal_entry_pin *src_pin,
                                   struct journal_entry_pin *pin,
                                   journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src_pin) &&
            (!journal_pin_active(pin) ||
             src_pin->seq < pin->seq)) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, src_pin->seq, pin, flush_fn);
        }

        spin_unlock(&j->lock);
}

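/*
 * Wait for any in-progress flush of @pin to finish; the pin must already have
 * been dropped:
 */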
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

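/*
 * Find the oldest pin at a sequence number <= @max_seq that still has a flush
 * callback pending, mark it as the flush in progress and return it, with *seq
 * set to its sequence number; returns NULL if there is nothing to flush:
 */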
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

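/*
 * Run pin flush callbacks: flush at least @min_nr pins regardless of their
 * sequence numbers, then keep flushing pins up to @seq_to_flush:
 */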
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

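/* Delayed work callback: take the reclaim lock and run journal reclaim: */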
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

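/*
 * Condition for bch2_journal_flush_pins(): flushes pins up to @seq_to_flush
 * and returns nonzero once nothing older remains pinned, or on journal error:
 */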
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

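/*
 * Flush all journal pins referencing device @dev_idx (or, if @dev_idx is
 * negative, all pins with fewer than metadata_replicas copies), then rebuild
 * the journal replicas entries from what is still pinned:
 */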
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}