#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

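/*
 * Map an enum journal_space_from to the corresponding per-device bucket index:
 * space can be measured from the last discarded bucket, the last bucket clean
 * on disk, or the last clean bucket.
 */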
static unsigned journal_space_from(struct journal_device *ja,
                                   enum journal_space_from from)
{
        switch (from) {
        case journal_space_discarded:
                return ja->discard_idx;
        case journal_space_clean_ondisk:
                return ja->dirty_idx_ondisk;
        case journal_space_clean:
                return ja->dirty_idx;
        default:
                BUG();
        }
}

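/*
 * Number of buckets on @ja currently available for journal writes, measured
 * from the given reclaim stage (discarded/clean-ondisk/clean):
 */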
unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned available = (journal_space_from(ja, from) -
                              ja->cur_idx - 1 + ja->nr) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available && test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                --available;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
                --available;

        return available;
}

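/*
 * Publish how many u64s are still available for journal prereservations,
 * using a cmpxchg loop so the update is atomic with respect to concurrent
 * prereservation gets/puts:
 */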
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
        union journal_preres_state old, new;
        u64 v = atomic64_read(&j->prereserved.counter);

        do {
                old.v = new.v = v;
                new.remaining = u64s_remaining;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
                                       old.v, new.v)) != old.v);
}

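/*
 * Compute, for the given reclaim stage, how many sectors the next journal
 * entry may use (next_entry) and how much space remains beyond that
 * (remaining), across the journal devices; returns zeroes if fewer than
 * @nr_devs_want devices have any space:
 */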
static struct journal_space {
        unsigned        next_entry;
        unsigned        remaining;
} __journal_space_available(struct journal *j, unsigned nr_devs_want,
                            enum journal_space_from from)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned i, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from);
                sectors_this_device = ja->sectors_free;

                /*
                 * We don't allocate the space for a journal entry until we
                 * write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_devs < nr_devs_want)
                return (struct journal_space) { 0, 0 };

        return (struct journal_space) {
                .next_entry     = sectors_next_entry,
                .remaining      = max_t(int, 0, sectors_total - sectors_next_entry),
        };
}

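/*
 * Recalculate available journal space and update j->cur_entry_sectors,
 * j->cur_entry_error and the prereserved space accordingly; called with
 * j->lock held whenever the amount of free journal space may have changed:
 */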
void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_space discarded, clean_ondisk, clean;
        unsigned overhead, u64s_remaining = 0;
        unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
                                      j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
        bool can_discard = false;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                while (ja->dirty_idx != ja->cur_idx &&
                       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

                while (ja->dirty_idx_ondisk != ja->dirty_idx &&
                       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
                        ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

                if (ja->discard_idx != ja->dirty_idx_ondisk)
                        can_discard = true;

                max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
                nr_online++;
        }
        rcu_read_unlock();

        j->can_discard = can_discard;

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                goto out;
        }

        if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                goto out;
        }

        nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

        discarded       = __journal_space_available(j, nr_devs_want, journal_space_discarded);
        clean_ondisk    = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk);
        clean           = __journal_space_available(j, nr_devs_want, journal_space_clean);

        if (!discarded.next_entry)
                ret = -ENOSPC;

        overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) *
                journal_entry_overhead(j);
        u64s_remaining = clean.remaining << 6; /* 512 byte sectors -> u64s */
        u64s_remaining = max_t(int, 0, u64s_remaining - overhead);
        u64s_remaining /= 4;
out:
        j->cur_entry_sectors    = !ret ? discarded.next_entry : 0;
        j->cur_entry_error      = ret;
        journal_set_remaining(j, u64s_remaining);
        journal_check_may_get_unreserved(j);

        if (!ret)
                journal_wake(j);
}

/* Discards - last part of journal reclaim: */

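/* True if @ja has buckets that are clean on disk but not yet discarded: */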
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->discard_idx != ja->dirty_idx_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->discard_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->discard_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

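/*
 * Illustrative sketch of how this API is typically used (the names below are
 * hypothetical, not from this file): code writing a btree node takes a pin on
 * the journal entry its keys went into, and the flush callback - invoked by
 * reclaim as pin->flush(j, pin, seq), assumed here to return void - writes the
 * node out and drops the pin:
 *
 *	static void btree_node_flush(struct journal *j,
 *				     struct journal_entry_pin *pin, u64 seq)
 *	{
 *		// write out the btree node holding this pin, then:
 *		bch2_journal_pin_drop(j, pin);
 *	}
 *
 *	bch2_journal_pin_add(j, seq, &my_pin, btree_node_flush);
 */
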
static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

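/*
 * Drop a reference on journal sequence number @seq; if it was the last one,
 * entries at the front of the pin FIFO may now be reclaimed:
 */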
void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

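/*
 * Add @pin to the pin list for @seq, taking a reference on that journal entry;
 * called with j->lock held, and the pin list for @seq must still have a
 * nonzero reference count:
 */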
static inline void __journal_pin_add(struct journal *j,
                                     u64 seq,
                                     struct journal_entry_pin *pin,
                                     journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        BUG_ON(journal_pin_active(pin));
        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        __journal_pin_add(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

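/*
 * Re-pin at @seq: drop and re-add the pin if its sequence number changed,
 * otherwise move it back onto the list of pins still to be flushed:
 */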
void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (pin->seq != seq) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);
}

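/*
 * Pin @pin at @src_pin's sequence number, but only if doing so would pin an
 * older (smaller) sequence number than @pin currently holds:
 */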
void bch2_journal_pin_add_if_older(struct journal *j,
                                   struct journal_entry_pin *src_pin,
                                   struct journal_entry_pin *pin,
                                   journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src_pin) &&
            (!journal_pin_active(pin) ||
             src_pin->seq < pin->seq)) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, src_pin->seq, pin, flush_fn);
        }

        spin_unlock(&j->lock);
}

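/*
 * Wait for any in-progress flush of @pin to finish; the pin must already have
 * been dropped:
 */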
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

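/*
 * Find the oldest pin at a sequence number <= @max_seq that still has a flush
 * callback pending, mark it as the flush in progress and return it, with *seq
 * set to its sequence number; returns NULL if there is nothing to flush:
 */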
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

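/*
 * Run pin flush callbacks: flush at least @min_nr pins regardless of their
 * sequence numbers, then keep flushing pins up to @seq_to_flush:
 */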
static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter, min_nr = 0;
        u64 seq_to_flush = 0;

        lockdep_assert_held(&j->reclaim_lock);

        bch2_journal_do_discards(j);

        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
                unsigned nr_buckets, bucket_to_flush;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;

                /* And include pre-reservations: */
                nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
                                           (ca->mi.bucket_size << 6) -
                                           journal_entry_overhead(j));

                nr_buckets = min(nr_buckets, ja->nr);

                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
                seq_to_flush = max(seq_to_flush,
                                   ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
                seq_to_flush = max(seq_to_flush, journal_last_seq(j));
                min_nr = 1;
        }

        journal_flush_pins(j, seq_to_flush, min_nr);

        if (!bch2_journal_error(j))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}

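/* Delayed work callback: take the reclaim lock and run journal reclaim: */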
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct journal *j = container_of(to_delayed_work(work),
                                struct journal, reclaim_work);

        mutex_lock(&j->reclaim_lock);
        bch2_journal_reclaim(j);
        mutex_unlock(&j->reclaim_lock);
}

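/*
 * Condition for bch2_journal_flush_pins(): flushes pins up to @seq_to_flush
 * and returns nonzero once nothing older remains pinned, or on journal error:
 */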
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

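/*
 * Flush all journal pins referencing device @dev_idx (or, if @dev_idx is
 * negative, all pins with fewer than metadata_replicas copies), then rebuild
 * the journal replicas entries from what is still pinned:
 */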
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}