// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                --available;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

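/*
 * Update the amount of prereserved journal space (in u64s) we advertise as
 * remaining, using a cmpxchg loop so the update is atomic with respect to
 * concurrent prereservations:
 */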
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

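/*
 * Compute free journal space for a given accounting level (@from): the number
 * of sectors available for the next journal entry, and how many sectors remain
 * beyond that, taking the minimum across all rw journal devices. Returns zero
 * if fewer than @nr_devs_want devices have space available.
 */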
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * Note that we don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

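/*
 * Recalculate the journal's cached free space: advance each device's dirty
 * and on-disk-dirty bucket indices past entries that are no longer pinned,
 * then update j->cur_entry_sectors, j->cur_entry_error and the prereserved
 * space accordingly. Must be called with j->lock held.
 */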
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6;
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

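/*
 * Drop an active pin: remove it from its pin list and release that list's
 * reference, potentially freeing journal space. Caller must hold j->lock.
 */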
static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

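/*
 * Re-point @pin at journal sequence number @seq: drop any existing pin, take
 * a reference on @seq's pin list, and add @pin to that list (or to the flushed
 * list if no flush callback was given). Caller must hold j->lock.
 */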
static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        __journal_pin_drop(j, pin);

        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}

void __bch2_journal_pin_add(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

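/*
 * Make @dst pin the same journal entry that @src pins, if @src is active and
 * @dst isn't already pinning an older or equal sequence number:
 */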
void bch2_journal_pin_copy(struct journal *j,
                           struct journal_entry_pin *dst,
                           struct journal_entry_pin *src,
                           journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src) &&
            (!journal_pin_active(dst) || src->seq < dst->seq))
                bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);

        spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal.
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

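/*
 * Find the oldest journal entry with sequence number <= @max_seq that still
 * has an unflushed pin; if one is found, move it to the flushed list, mark it
 * as the flush in progress, and return it with *seq set to its sequence
 * number.
 */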
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

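/*
 * Flush pins up to @seq_to_flush by calling their flush callbacks; if @min_nr
 * is nonzero, flush at least that many pins regardless of their sequence
 * number. Caller must hold j->reclaim_lock.
 */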
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

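/*
 * Workqueue entry point for background reclaim: takes j->reclaim_lock and
 * runs bch2_journal_reclaim().
 */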
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

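/*
 * Condition for the closure_wait_event() in bch2_journal_flush_pins(): flush
 * what we can and return nonzero once everything up to @seq_to_flush has been
 * flushed, can't be flushed until journal replay finishes, or the journal has
 * hit an error.
 */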
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

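/*
 * Flush all journal pins referencing device @dev_idx (or, if @dev_idx is
 * negative, all entries with fewer than metadata_replicas copies), then
 * re-mark the journal replicas entries that remain:
 */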
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}