// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

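/*
 * Set the "remaining" field of j->prereserved (the budget available for
 * prereservations) with a cmpxchg loop, so a concurrently changing reserved
 * count isn't clobbered:
 */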
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

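/*
 * Compute journal space relative to the given watermark ("from"): the sectors
 * available for the next journal entry and the total sectors remaining,
 * taking the minimum across member devices with journal buckets. Returns
 * zeroes if fewer than nr_devs_want devices have any space.
 */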
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * We don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

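/*
 * Recalculate available journal space and cache it in the journal: advance
 * each device's dirty/dirty-ondisk indexes past buckets that are no longer
 * needed, note whether there's anything to discard, and update
 * j->cur_entry_sectors, j->cur_entry_error and the prereservation budget.
 * Must be called with j->lock held.
 */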
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6;
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

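/*
 * Returns true if this device has journal buckets that are clean on disk but
 * not yet discarded, i.e. bch2_journal_do_discards() has work to do:
 */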
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

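/*
 * Drop a reference on journal sequence number @seq; if it was the last
 * reference, pop now-unpinned entries off the front of the pin fifo.
 */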
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

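/*
 * Attach @pin to the pin list for @seq, dropping whatever it was pinning
 * before. Pins with a flush callback go on the list of pins still to be
 * flushed; pins without one go straight to the flushed list. Caller must
 * hold j->lock.
 */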
static void bch2_journal_pin_add_locked(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        __journal_pin_drop(j, pin);

        BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
}

void __bch2_journal_pin_add(struct journal *j, u64 seq,
                            struct journal_entry_pin *pin,
                            journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        if (journal_pin_active(pin) && pin->seq < seq)
                return;

        spin_lock(&j->lock);

        if (pin->seq != seq) {
                bch2_journal_pin_add_locked(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                /*
                 * If the pin is already pinning the right sequence number, it
                 * still might've already been flushed:
                 */
                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

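/*
 * Make @dst pin the sequence number @src is pinning, if @src is active and
 * @dst isn't already pinning that sequence or an older one.
 */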
void bch2_journal_pin_copy(struct journal *j,
                           struct journal_entry_pin *dst,
                           struct journal_entry_pin *src,
                           journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src) &&
            (!journal_pin_active(dst) || src->seq < dst->seq))
                bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn);

        spin_unlock(&j->lock);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

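/*
 * Find the oldest pin with seq <= @max_seq that still has a flush callback
 * pending, move it to its list's flushed list and mark it as the flush in
 * progress; *@seq is set to its sequence number. Returns NULL if reclaim
 * hasn't started yet or there's nothing left to flush.
 */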
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
                return NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

/* returns true if we did work */
static bool journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        bool ret = false;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
                ret = true;
        }

        return ret;
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

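/*
 * Delayed work wrapper: takes j->reclaim_lock and runs bch2_journal_reclaim()
 * from the journal reclaim workqueue.
 */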
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

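/*
 * Wait condition for bch2_journal_flush_pins(): flushes another batch of pins
 * and returns nonzero when waiting can stop - on journal error, while journal
 * replay hasn't completed (unreplayed entries can't be flushed), once
 * everything up to @seq_to_flush has been unpinned, or when only the
 * currently open entry remains pinned.
 */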
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
                              bool *did_work)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        *did_work = journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        bool did_work = false;

        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return false;

        closure_wait_event(&j->async_wait,
                journal_flush_done(j, seq_to_flush, &did_work));

        return did_work;
}

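/*
 * Flush journal pins so that no dirty journal entry still references device
 * @dev_idx (or, if @dev_idx is negative, any entry replicated on fewer than
 * metadata_replicas devices), then update the replicas table for the entries
 * that remain. Used e.g. when migrating data off a device.
 */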
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}