// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_key_cache.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

#include <linux/kthread.h>
#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

/* Free space calculations: */

static unsigned journal_space_from(struct journal_device *ja,
				   enum journal_space_from from)
{
	switch (from) {
	case journal_space_discarded:
		return ja->discard_idx;
	case journal_space_clean_ondisk:
		return ja->dirty_idx_ondisk;
	case journal_space_clean:
		return ja->dirty_idx;
	default:
		BUG();
	}
}

unsigned bch2_journal_dev_buckets_available(struct journal *j,
					    struct journal_device *ja,
					    enum journal_space_from from)
{
	unsigned available = (journal_space_from(ja, from) -
			      ja->cur_idx - 1 + ja->nr) % ja->nr;

	/*
	 * Don't use the last bucket unless writing the new last_seq
	 * will make another bucket available:
	 */
	if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
		--available;

	return available;
}

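/*
 * Editor's sketch (not part of bcachefs): the modular arithmetic above is the
 * standard ring-buffer free-slot computation.  A minimal standalone version,
 * with hypothetical names:
 */
#if 0
static unsigned ring_free_slots(unsigned head, unsigned tail, unsigned nr)
{
	/*
	 * Slots the producer at @head may claim before reaching the consumer
	 * at @tail; one slot is held back so that head == tail unambiguously
	 * means "empty".  E.g. nr = 8, head = 6, tail = 2:
	 * (2 - 6 - 1 + 8) % 8 = 3 free slots.
	 */
	return (tail - head - 1 + nr) % nr;
}
#endif
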
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
{
	union journal_preres_state old, new;
	u64 v = atomic64_read(&j->prereserved.counter);

	do {
		old.v = new.v = v;
		new.remaining = u64s_remaining;
	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
				       old.v, new.v)) != old.v);
}

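/*
 * Editor's sketch: the loop above is the usual lockless read-modify-write
 * idiom - snapshot the packed state word, modify one field in the copy, and
 * publish it with compare-and-exchange, retrying if another thread raced us.
 * Generic form (hypothetical names):
 */
#if 0
static void set_low_32_bits(atomic64_t *state, u32 field)
{
	u64 old, new, v = atomic64_read(state);

	do {
		old = new = v;
		new = (new & ~0xffffffffULL) | field;	/* update our field */
	} while ((v = atomic64_cmpxchg(state, old, new)) != old);
}
#endif
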
static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
{
	unsigned sectors = 0;

	while (!sectors && *idx != j->reservations.idx) {
		sectors = j->buf[*idx].sectors;
		*idx = (*idx + 1) & JOURNAL_BUF_MASK;
	}

	return sectors;
}

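/*
 * Editor's note: two ring-index styles appear in this file.  j->buf has a
 * power-of-two size, so the wrap above can use "& JOURNAL_BUF_MASK"; journal
 * buckets (ja->nr) may be any count, so bucket indices wrap with "% ja->nr".
 * E.g. with 4 buffers, (3 + 1) & 3 == 0 == (3 + 1) % 4.
 */
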
static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
			    enum journal_space_from from)
{
	struct journal_device *ja = &ca->journal;
	unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;

	if (from == journal_space_total)
		return (struct journal_space) {
			.next_entry	= ca->mi.bucket_size,
			.total		= ca->mi.bucket_size * ja->nr,
		};

	buckets = bch2_journal_dev_buckets_available(j, ja, from);
	sectors = ja->sectors_free;

	/*
	 * Note that we don't allocate the space for a journal entry
	 * until we write it out - thus, account for it here:
	 */
	while ((unwritten = get_unwritten_sectors(j, &idx))) {
		if (unwritten >= sectors) {
			if (!buckets) {
				sectors = 0;
				break;
			}
			buckets--;
			sectors = ca->mi.bucket_size;
		}

		sectors -= unwritten;
	}

	if (sectors < ca->mi.bucket_size && buckets) {
		buckets--;
		sectors = ca->mi.bucket_size;
	}

	return (struct journal_space) {
		.next_entry	= sectors,
		.total		= sectors + buckets * ca->mi.bucket_size,
	};
}

static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
			    enum journal_space_from from)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned i, pos, nr_devs = 0;
	struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];

	BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i,
				   &c->rw_devs[BCH_DATA_journal]) {
		if (!ca->journal.nr)
			continue;

		space = journal_dev_space_available(j, ca, from);
		if (!space.next_entry)
			continue;

		for (pos = 0; pos < nr_devs; pos++)
			if (space.total > dev_space[pos].total)
				break;

		array_insert_item(dev_space, nr_devs, pos, space);
	}
	rcu_read_unlock();

	if (nr_devs < nr_devs_want)
		return (struct journal_space) { 0, 0 };

	/*
	 * We sorted largest to smallest, and we want the smallest out of the
	 * @nr_devs_want largest devices:
	 */
	return dev_space[nr_devs_want - 1];
}

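/*
 * Editor's sketch: stripped of journal specifics, the selection above is
 * "insertion-sort descending, then take the k-th element" - the usable space
 * when every entry must land on k devices is the k-th largest per-device
 * total.  Hypothetical standalone version:
 */
#if 0
/* insert @v into @sorted (descending); the k-th largest is sorted[k - 1] */
static void sorted_insert(unsigned *sorted, unsigned *nr, unsigned v)
{
	unsigned pos, i;

	for (pos = 0; pos < *nr; pos++)
		if (v > sorted[pos])
			break;

	for (i = (*nr)++; i > pos; i--)
		sorted[i] = sorted[i - 1];
	sorted[pos] = v;
}
#endif
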
void bch2_journal_space_available(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned clean, clean_ondisk, total;
	s64 u64s_remaining = 0;
	unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
				      j->buf[1].buf_size >> 9);
	unsigned i, nr_online = 0, nr_devs_want;
	bool can_discard = false;
	int ret = 0;

	lockdep_assert_held(&j->lock);

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i,
				   &c->rw_devs[BCH_DATA_journal]) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

		while (ja->dirty_idx != ja->cur_idx &&
		       ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
			ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;

		while (ja->dirty_idx_ondisk != ja->dirty_idx &&
		       ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;

		if (ja->discard_idx != ja->dirty_idx_ondisk)
			can_discard = true;

		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
		nr_online++;
	}
	rcu_read_unlock();

	j->can_discard = can_discard;

	if (nr_online < c->opts.metadata_replicas_required) {
		ret = cur_entry_insufficient_devices;
		goto out;
	}

	nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);

	for (i = 0; i < journal_space_nr; i++)
		j->space[i] = __journal_space_available(j, nr_devs_want, i);

	clean_ondisk	= j->space[journal_space_clean_ondisk].total;
	clean		= j->space[journal_space_clean].total;
	total		= j->space[journal_space_total].total;

	if (!clean_ondisk &&
	    j->reservations.idx ==
	    j->reservations.unwritten_idx) {
		char *buf = kmalloc(4096, GFP_ATOMIC);

		bch_err(c, "journal stuck");
		if (buf) {
			__bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
			pr_err("\n%s", buf);
			kfree(buf);
		}

		bch2_fatal_error(c);
		ret = cur_entry_journal_stuck;
	} else if (!j->space[journal_space_discarded].next_entry)
		ret = cur_entry_journal_full;
	else if (!fifo_free(&j->pin))
		ret = cur_entry_journal_pin_full;

	if ((j->space[journal_space_clean_ondisk].next_entry <
	     j->space[journal_space_clean_ondisk].total) &&
	    (clean - clean_ondisk <= total / 8) &&
	    (clean_ondisk * 2 > clean))
		set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
	else
		clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);

	u64s_remaining  = (u64) clean << 6;
	u64s_remaining -= (u64) total << 3;
	u64s_remaining = max(0LL, u64s_remaining);
	u64s_remaining /= (u64) c->opts.btree_node_size << 6;
	u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
out:
	j->cur_entry_sectors	= !ret ? j->space[journal_space_discarded].next_entry : 0;
	j->cur_entry_error	= ret;
	journal_set_remaining(j, u64s_remaining);
	journal_check_may_get_unreserved(j);

	if (!ret)
		journal_wake(j);
}

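/*
 * Editor's note, with made-up numbers: in the pre-reservation calculation
 * above, clean = 1024 sectors, total = 4096 sectors and 512-sector btree
 * nodes give
 *
 *	(1024 << 6) - (4096 << 3) = 32768;  32768 / (512 << 6) = 1
 *
 * i.e. remaining pre-reservation space is roughly
 * (clean - total/8) / btree_node_size, and drops to zero once clean space
 * falls to an eighth of the journal.
 */
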
/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->discard_idx != ja->dirty_idx_ondisk;
	spin_unlock(&j->lock);

	return ret;
}

/*
 * Advance ja->discard_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
void bch2_journal_do_discards(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	unsigned iter;

	mutex_lock(&j->discard_lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		while (should_discard_bucket(j, ja)) {
			if (ca->mi.discard &&
			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->discard_idx]),
					ca->mi.bucket_size, GFP_NOIO, 0);

			spin_lock(&j->lock);
			ja->discard_idx = (ja->discard_idx + 1) % ja->nr;

			bch2_journal_space_available(j);
			spin_unlock(&j->lock);
		}
	}

	mutex_unlock(&j->discard_lock);
}

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

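/*
 * Editor's sketch of how a client typically uses this machinery (the flush
 * callback and caller here are hypothetical; real users live in the btree
 * update paths):
 */
#if 0
static void example_flush(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	/* write out whatever still depends on journal entry @seq */
}

static void example_usage(struct journal *j, u64 seq, struct journal_entry_pin *pin)
{
	/* hold entry @seq open; reclaim calls example_flush() to release it: */
	bch2_journal_pin_set(j, seq, pin, example_flush);

	/* ... later, once the dependent data is persistent: */
	bch2_journal_pin_drop(j, pin);
	bch2_journal_pin_flush(j, pin);	/* wait for any in-flight callback */
}
#endif
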
static void bch2_journal_reclaim_fast(struct journal *j)
{
	struct journal_entry_pin_list temp;
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!fifo_empty(&j->pin) &&
	       !atomic_read(&fifo_peek_front(&j->pin).count)) {
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
		BUG_ON(!fifo_pop(&j->pin, temp));
		popped = true;
	}

	if (popped)
		bch2_journal_space_available(j);
}

void __bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	if (atomic_dec_and_test(&pin_list->count))
		bch2_journal_reclaim_fast(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
	struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

	if (atomic_dec_and_test(&pin_list->count)) {
		spin_lock(&j->lock);
		bch2_journal_reclaim_fast(j);
		spin_unlock(&j->lock);
	}
}

static inline void __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list;

	if (!journal_pin_active(pin))
		return;

	pin_list = journal_seq_pin(j, pin->seq);
	pin->seq = 0;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	if (atomic_dec_and_test(&pin_list->count) &&
	    pin_list == &fifo_peek_front(&j->pin))
		bch2_journal_reclaim_fast(j);
	else if (fifo_used(&j->pin) == 1 &&
		 atomic_read(&pin_list->count) == 1)
		journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	__journal_pin_drop(j, pin);
	spin_unlock(&j->lock);
}

void bch2_journal_pin_set(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	struct journal_entry_pin_list *pin_list;

	spin_lock(&j->lock);

	if (seq < journal_last_seq(j)) {
		/*
		 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
		 * the src pin - with the pin dropped, the entry to pin might no
		 * longer exist, but that means there's no longer anything to
		 * copy and we can bail out here:
		 */
		spin_unlock(&j->lock);
		return;
	}

	pin_list = journal_seq_pin(j, seq);

	__journal_pin_drop(j, pin);

	atomic_inc(&pin_list->count);
	pin->seq	= seq;
	pin->flush	= flush_fn;

	list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
	spin_unlock(&j->lock);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}

/**
 * bch2_journal_pin_flush: ensure journal pin callback is no longer running
 */
void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
	BUG_ON(journal_pin_active(pin));

	wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret = NULL;

	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
		return NULL;

	spin_lock(&j->lock);

	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
		if (*seq > max_seq ||
		    (ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list)))
			break;

	if (ret) {
		list_move(&ret->list, &pin_list->flushed);
		BUG_ON(j->flush_in_progress);
		j->flush_in_progress = ret;
	}

	spin_unlock(&j->lock);

	return ret;
}

/* Returns the number of journal pins flushed: */
static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
			      unsigned min_nr)
{
	struct journal_entry_pin *pin;
	u64 seq, ret = 0;

	lockdep_assert_held(&j->reclaim_lock);

	while (1) {
		cond_resched();

		j->last_flushed = jiffies;

		pin = journal_get_next_pin(j, min_nr
				? U64_MAX : seq_to_flush, &seq);
		if (!pin)
			break;

		if (min_nr)
			min_nr--;

		pin->flush(j, pin, seq);

		BUG_ON(j->flush_in_progress != pin);
		j->flush_in_progress = NULL;
		wake_up(&j->pin_flush_wait);
		ret++;
	}

	return ret;
}

static u64 journal_seq_to_flush(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	u64 seq_to_flush = 0;
	unsigned iter;

	spin_lock(&j->lock);

	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;
		unsigned nr_buckets, bucket_to_flush;

		if (!ja->nr)
			continue;

		/* Try to keep the journal at most half full: */
		nr_buckets = ja->nr / 2;

		/* And include pre-reservations: */
		nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
					   (ca->mi.bucket_size << 6) -
					   journal_entry_overhead(j));

		nr_buckets = min(nr_buckets, ja->nr);

		bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
		seq_to_flush = max(seq_to_flush,
				   ja->bucket_seq[bucket_to_flush]);
	}

	/* Also flush if the pin fifo is more than half full */
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	return seq_to_flush;
}

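/*
 * Editor's note, with made-up numbers: ja->nr = 16, cur_idx = 3 and no
 * pre-reservations give nr_buckets = 8 and bucket_to_flush = (3 + 8) % 16 =
 * 11, so we flush pins up to bucket_seq[11] - keeping at most half the
 * buckets dirty.  Pre-reservations raise nr_buckets by reserved / (usable
 * u64s per bucket), clamped to ja->nr.
 */
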
/**
 * bch2_journal_reclaim - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */

static int __bch2_journal_reclaim(struct journal *j, bool direct)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	u64 seq_to_flush, nr_flushed = 0;
	size_t min_nr;
	unsigned flags;
	int ret = 0;

	/*
	 * We can't invoke memory reclaim while holding the reclaim_lock -
	 * journal reclaim is required to make progress for memory reclaim
	 * (cleaning the caches), so we can't get stuck in memory reclaim while
	 * we're holding the reclaim lock:
	 */
	lockdep_assert_held(&j->reclaim_lock);
	flags = memalloc_noreclaim_save();

	do {
		if (kthread && kthread_should_stop())
			break;

		if (bch2_journal_error(j)) {
			ret = -EIO;
			break;
		}

		bch2_journal_do_discards(j);

		seq_to_flush = journal_seq_to_flush(j);
		min_nr = 0;

		/*
		 * If it's been longer than j->reclaim_delay_ms since we last flushed,
		 * make sure to flush at least one journal pin:
		 */
		if (time_after(jiffies, j->last_flushed +
			       msecs_to_jiffies(j->reclaim_delay_ms)))
			min_nr = 1;

		if (j->prereserved.reserved * 2 > j->prereserved.remaining)
			min_nr = 1;

		if (atomic_read(&c->btree_cache.dirty) * 4 >
		    c->btree_cache.used * 3)
			min_nr = 1;

		if (fifo_free(&j->pin) <= 32)
			min_nr = 1;

		min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c));

		trace_journal_reclaim_start(c,
				min_nr,
				j->prereserved.reserved,
				j->prereserved.remaining,
				atomic_read(&c->btree_cache.dirty),
				c->btree_cache.used,
				atomic_long_read(&c->btree_key_cache.nr_dirty),
				atomic_long_read(&c->btree_key_cache.nr_keys));

		nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr);

		if (direct)
			j->nr_direct_reclaim += nr_flushed;
		else
			j->nr_background_reclaim += nr_flushed;
		trace_journal_reclaim_finish(c, nr_flushed);
	} while (min_nr && nr_flushed);

	memalloc_noreclaim_restore(flags);

	return ret;
}

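/*
 * Editor's note on memalloc_noreclaim_save()/restore() above: it sets
 * PF_MEMALLOC on the current task, so allocations made while reclaim_lock is
 * held cannot enter direct reclaim and recurse back into journal reclaim.
 * The general pattern:
 */
#if 0
	unsigned flags = memalloc_noreclaim_save();

	ptr = kmalloc(size, GFP_KERNEL);	/* won't recurse into reclaim */

	memalloc_noreclaim_restore(flags);
#endif
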
int bch2_journal_reclaim(struct journal *j)
{
	return __bch2_journal_reclaim(j, true);
}

static int bch2_journal_reclaim_thread(void *arg)
{
	struct journal *j = arg;
	unsigned long next;
	int ret = 0;

	set_freezable();

	kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));

	while (!ret && !kthread_should_stop()) {
		j->reclaim_kicked = false;

		mutex_lock(&j->reclaim_lock);
		ret = __bch2_journal_reclaim(j, false);
		mutex_unlock(&j->reclaim_lock);

		next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);

		while (1) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (kthread_should_stop())
				break;
			if (j->reclaim_kicked)
				break;
			if (time_after_eq(jiffies, next))
				break;
			schedule_timeout(next - jiffies);
		}
		__set_current_state(TASK_RUNNING);
	}

	return 0;
}

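/*
 * Editor's note: the j->reclaim_kicked handshake above pairs with a kick
 * helper on the journal side - roughly the following (the real one lives in
 * journal_reclaim.h):
 */
#if 0
static inline void journal_reclaim_kick(struct journal *j)
{
	struct task_struct *p = READ_ONCE(j->reclaim_thread);

	j->reclaim_kicked = true;
	if (p)
		wake_up_process(p);
}
#endif
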
void bch2_journal_reclaim_stop(struct journal *j)
{
	struct task_struct *p = j->reclaim_thread;

	j->reclaim_thread = NULL;

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}

int bch2_journal_reclaim_start(struct journal *j)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct task_struct *p;

	if (j->reclaim_thread)
		return 0;

	p = kthread_create(bch2_journal_reclaim_thread, j,
			   "bch-reclaim/%s", c->name);
	if (IS_ERR(p)) {
		bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
		return PTR_ERR(p);
	}

	get_task_struct(p);
	j->reclaim_thread = p;
	wake_up_process(p);
	return 0;
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      bool *did_work)
{
	int ret;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&j->reclaim_lock);

	*did_work = journal_flush_pins(j, seq_to_flush, 0) != 0;

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		(fifo_used(&j->pin) == 1 &&
		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

	spin_unlock(&j->lock);
	mutex_unlock(&j->reclaim_lock);

	return ret;
}

bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	bool did_work = false;

	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return false;

	closure_wait_event(&j->async_wait,
		journal_flush_done(j, seq_to_flush, &did_work));

	return did_work;
}

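/*
 * Editor's note: callers that need to flush everything (e.g. clean shutdown)
 * pass U64_MAX as the target sequence number; a thin wrapper to that effect
 * (matching the one declared in journal_reclaim.h) would be:
 */
#if 0
static inline bool bch2_journal_flush_all_pins(struct journal *j)
{
	return bch2_journal_flush_pins(j, U64_MAX);
}
#endif
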
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	bch2_journal_flush_pins(j, seq);

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);

	seq = 0;

	spin_lock(&j->lock);
	while (!ret && seq < j->pin.back) {
		struct bch_replicas_padded replicas;

		seq = max(seq, journal_last_seq(j));
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 journal_seq_pin(j, seq)->devs);
		seq++;

		spin_unlock(&j->lock);
		ret = bch2_mark_replicas(c, &replicas.e);
		spin_lock(&j->lock);
	}
	spin_unlock(&j->lock);

	ret = bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}