/* libbcachefs/journal_reclaim.c */

#include "bcachefs.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/* Free space calculations: */

unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned next = (ja->cur_idx + 1) % ja->nr;
        unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;

        /*
         * Allocator startup needs some journal space before we can do journal
         * replay:
         */
        if (available &&
            test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
                available--;

        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
        if (available &&
            journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
                --available;

        return available;
}
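
/*
 * Worked example (illustrative only, not part of the original source): with
 * ja->nr = 8, ja->cur_idx = 5 and ja->last_idx = 2, we get next = 6 and
 * available = (2 + 8 - 6) % 8 = 4, i.e. buckets 6, 7, 0 and 1. Each of the
 * two conditional decrements above may then reserve one of those, leaving
 * 2, 3 or 4 buckets actually reported as available.
 */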

void bch2_journal_space_available(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned sectors_next_entry     = UINT_MAX;
        unsigned sectors_total          = UINT_MAX;
        unsigned max_entry_size         = min(j->buf[0].buf_size >> 9,
                                              j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs = 0;
        unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
                ? journal_prev_buf(j)->sectors
                : 0;
        int ret = 0;

        lockdep_assert_held(&j->lock);

        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
                unsigned buckets_this_device, sectors_this_device;

                if (!ja->nr)
                        continue;

                nr_online++;

                buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
                sectors_this_device = ja->sectors_free;

                /*
                 * Note that we don't allocate the space for a journal entry
                 * until we write it out - thus, account for it here:
                 */
                if (unwritten_sectors >= sectors_this_device) {
                        if (!buckets_this_device)
                                continue;

                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                sectors_this_device -= unwritten_sectors;

                if (sectors_this_device < ca->mi.bucket_size &&
                    buckets_this_device) {
                        buckets_this_device--;
                        sectors_this_device = ca->mi.bucket_size;
                }

                if (!sectors_this_device)
                        continue;

                sectors_next_entry = min(sectors_next_entry,
                                         sectors_this_device);

                sectors_total = min(sectors_total,
                        buckets_this_device * ca->mi.bucket_size +
                        sectors_this_device);

                max_entry_size = min_t(unsigned, max_entry_size,
                                       ca->mi.bucket_size);

                nr_devs++;
        }
        rcu_read_unlock();

        if (nr_online < c->opts.metadata_replicas_required) {
                ret = -EROFS;
                sectors_next_entry = 0;
        } else if (!sectors_next_entry ||
                   nr_devs < min_t(unsigned, nr_online,
                                   c->opts.metadata_replicas)) {
                ret = -ENOSPC;
                sectors_next_entry = 0;
        } else if (!fifo_free(&j->pin)) {
                ret = -ENOSPC;
                sectors_next_entry = 0;
        }

        j->cur_entry_sectors    = sectors_next_entry;
        j->cur_entry_error      = ret;

        if (!ret)
                journal_wake(j);
}
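
/*
 * Illustrative trace of the per-device accounting above (numbers are made up,
 * not from the original source): on a device with 256-sector buckets,
 * ja->sectors_free = 100, 3 available buckets and 40 unwritten sectors, the
 * current bucket is left with 60 sectors; that is less than a full bucket, so
 * one bucket is consumed and this device contributes
 * sectors_next_entry = 256 and sectors_total = 2 * 256 + 256 = 768.
 */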

/* Discards - last part of journal reclaim: */

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
        bool ret;

        spin_lock(&j->lock);
        ret = ja->nr &&
                ja->last_idx != ja->cur_idx &&
                ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
        spin_unlock(&j->lock);

        return ret;
}

/*
 * Advance ja->last_idx as long as it points to buckets that are no longer
 * dirty, issuing discards if necessary:
 */
static void journal_do_discards(struct journal *j)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned iter;

        mutex_lock(&j->reclaim_lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                while (should_discard_bucket(j, ja)) {
                        if (ca->mi.discard &&
                            blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
                                blkdev_issue_discard(ca->disk_sb.bdev,
                                        bucket_to_sector(ca,
                                                ja->buckets[ja->last_idx]),
                                        ca->mi.bucket_size, GFP_NOIO, 0);

                        spin_lock(&j->lock);
                        ja->last_idx = (ja->last_idx + 1) % ja->nr;

                        bch2_journal_space_available(j);
                        spin_unlock(&j->lock);
                }
        }

        mutex_unlock(&j->reclaim_lock);
}
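
/*
 * Illustrative sketch (not part of the original source): with ja->nr = 8,
 * ja->last_idx = 2, ja->cur_idx = 5 and bucket_seq[2..4] all older than
 * j->last_seq_ondisk, the loop above discards buckets 2, 3 and 4 (when the
 * device and its queue support discard) and leaves ja->last_idx == 5,
 * recalculating the available journal space after each step.
 */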

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

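/*
 * Usage sketch (illustrative only; the callback shape follows the
 * pin->flush(j, pin, seq) call in journal_flush_pins() below, and
 * my_object/my_object_flush are hypothetical caller-side names):
 *
 *      static void my_object_flush(struct journal *j,
 *                                  struct journal_entry_pin *pin, u64 seq)
 *      {
 *              // write out whatever dirty state the pin protects, then:
 *              bch2_journal_pin_drop(j, pin);
 *      }
 *
 *      bch2_journal_pin_add(j, seq, &my_object->journal_pin, my_object_flush);
 *
 * where @seq is the sequence number of a still-open journal entry (e.g. the
 * one the caller's update went into). The pin keeps last_seq from advancing
 * past @seq until the flush callback runs or the caller drops the pin.
 */
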
static void bch2_journal_reclaim_fast(struct journal *j)
{
        struct journal_entry_pin_list temp;
        bool popped = false;

        lockdep_assert_held(&j->lock);

        /*
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
        }

        if (popped)
                bch2_journal_space_available(j);
}

void bch2_journal_pin_put(struct journal *j, u64 seq)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        if (atomic_dec_and_test(&pin_list->count)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
        }
}

static inline void __journal_pin_add(struct journal *j,
                                     u64 seq,
                                     struct journal_entry_pin *pin,
                                     journal_pin_flush_fn flush_fn)
{
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);

        BUG_ON(journal_pin_active(pin));
        BUG_ON(!atomic_read(&pin_list->count));

        atomic_inc(&pin_list->count);
        pin->seq        = seq;
        pin->flush      = flush_fn;

        list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);

        /*
         * If the journal is currently full, we might want to call flush_fn
         * immediately:
         */
        journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
                          journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);
        __journal_pin_add(j, seq, pin, flush_fn);
        spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
                                      struct journal_entry_pin *pin)
{
        struct journal_entry_pin_list *pin_list;

        if (!journal_pin_active(pin))
                return;

        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);

        /*
         * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        if (atomic_dec_and_test(&pin_list->count) &&
            pin_list == &fifo_peek_front(&j->pin))
                bch2_journal_reclaim_fast(j);
        else if (fifo_used(&j->pin) == 1 &&
                 atomic_read(&pin_list->count) == 1)
                journal_wake(j);
}

void bch2_journal_pin_drop(struct journal *j,
                           struct journal_entry_pin *pin)
{
        spin_lock(&j->lock);
        __journal_pin_drop(j, pin);
        spin_unlock(&j->lock);
}

void bch2_journal_pin_update(struct journal *j, u64 seq,
                             struct journal_entry_pin *pin,
                             journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (pin->seq != seq) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, seq, pin, flush_fn);
        } else {
                struct journal_entry_pin_list *pin_list =
                        journal_seq_pin(j, seq);

                list_move(&pin->list, &pin_list->list);
        }

        spin_unlock(&j->lock);
}

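/*
 * Intended-use note (a hedged summary, not from the original source): when an
 * object takes over dirty state that was protected by @src_pin (e.g. keys
 * copied from one in-memory structure to another), this makes sure @pin pins
 * a sequence number no newer than @src_pin's, so the journal entries covering
 * that state still cannot be reclaimed until the new owner gets flushed.
 */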
void bch2_journal_pin_add_if_older(struct journal *j,
                                  struct journal_entry_pin *src_pin,
                                  struct journal_entry_pin *pin,
                                  journal_pin_flush_fn flush_fn)
{
        spin_lock(&j->lock);

        if (journal_pin_active(src_pin) &&
            (!journal_pin_active(pin) ||
             src_pin->seq < pin->seq)) {
                __journal_pin_drop(j, pin);
                __journal_pin_add(j, src_pin->seq, pin, flush_fn);
        }

        spin_unlock(&j->lock);
}

void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
{
        BUG_ON(journal_pin_active(pin));

        wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space in
 * the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

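/*
 * Rough call flow (a descriptive summary of the code below, not part of the
 * original source):
 *
 *      bch2_journal_reclaim_work()
 *        -> journal_do_discards()
 *        -> journal_flush_pins(j, seq_to_flush, min_nr)
 *             -> journal_get_next_pin()   picks the oldest unflushed pin
 *             -> pin->flush(j, pin, seq)  writes out the pinned state
 *
 * bch2_journal_flush_pins() and bch2_journal_flush_device_pins() drive the
 * same journal_flush_pins() machinery synchronously.
 */
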
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;

        spin_lock(&j->lock);

        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;

        if (ret) {
                list_move(&ret->list, &pin_list->flushed);
                BUG_ON(j->flush_in_progress);
                j->flush_in_progress = ret;
                j->last_flushed = jiffies;
        }

        spin_unlock(&j->lock);

        return ret;
}

static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
                               unsigned min_nr)
{
        struct journal_entry_pin *pin;
        u64 seq;

        lockdep_assert_held(&j->reclaim_lock);

        while ((pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq))) {
                if (min_nr)
                        min_nr--;

                pin->flush(j, pin, seq);

                BUG_ON(j->flush_in_progress != pin);
                j->flush_in_progress = NULL;
                wake_up(&j->pin_flush_wait);
        }
}

/**
 * bch2_journal_reclaim_work - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
void bch2_journal_reclaim_work(struct work_struct *work)
{
        struct bch_fs *c = container_of(to_delayed_work(work),
                                struct bch_fs, journal.reclaim_work);
        struct journal *j = &c->journal;
        struct bch_dev *ca;
        unsigned iter, bucket_to_flush, min_nr = 0;
        u64 seq_to_flush = 0;

        journal_do_discards(j);

        mutex_lock(&j->reclaim_lock);
        spin_lock(&j->lock);

        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;

                if (!ja->nr)
                        continue;

                /* Try to keep the journal at most half full: */
                bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
                seq_to_flush = max_t(u64, seq_to_flush,
                                     ja->bucket_seq[bucket_to_flush]);
        }

        /* Also flush if the pin fifo is more than half full */
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
        spin_unlock(&j->lock);

        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
        if (time_after(jiffies, j->last_flushed +
                       msecs_to_jiffies(j->reclaim_delay_ms)))
                min_nr = 1;

        journal_flush_pins(j, seq_to_flush, min_nr);

        mutex_unlock(&j->reclaim_lock);

        if (!test_bit(BCH_FS_RO, &c->flags))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
                                   msecs_to_jiffies(j->reclaim_delay_ms));
}
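
/*
 * Worked example (illustrative only, not from the original source): with
 * ja->nr = 8 and ja->cur_idx = 6, bucket_to_flush = (6 + 4) % 8 = 2, so
 * reclaim targets everything still pinning bucket_seq[2]; and with
 * j->pin.size = 1024 and journal_cur_seq(j) = 10000, the fifo bound raises
 * seq_to_flush to at least 10000 - 512, keeping both the on-disk journal
 * and the in-memory pin fifo at most roughly half full.
 */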

static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
        int ret;

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&j->reclaim_lock);

        journal_flush_pins(j, seq_to_flush, 0);

        spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
         */
        ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
                journal_last_seq(j) > seq_to_flush ||
                (fifo_used(&j->pin) == 1 &&
                 atomic_read(&fifo_peek_front(&j->pin).count) == 1);

        spin_unlock(&j->lock);
        mutex_unlock(&j->reclaim_lock);

        return ret;
}

void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
        if (!test_bit(JOURNAL_STARTED, &j->flags))
                return;

        closure_wait_event(&j->async_wait, journal_flush_done(j, seq_to_flush));
}

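/*
 * Summary (added note, not from the original source): with dev_idx >= 0 this
 * flushes every journal entry that still references the given device (as when
 * migrating data off of it), and with a negative dev_idx it flushes entries
 * whose journal writes are under-replicated; afterwards it re-marks replicas
 * entries for whatever journal sequence numbers remain pinned.
 */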
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
        u64 iter, seq = 0;
        int ret = 0;

        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
                if (dev_idx >= 0
                    ? bch2_dev_list_has_dev(p->devs, dev_idx)
                    : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);

        bch2_journal_flush_pins(j, seq);

        ret = bch2_journal_error(j);
        if (ret)
                return ret;

        mutex_lock(&c->replicas_gc_lock);
        bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

        seq = 0;

        spin_lock(&j->lock);
        while (!ret && seq < j->pin.back) {
                struct bch_replicas_padded replicas;

                seq = max(seq, journal_last_seq(j));
                bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
                                         journal_seq_pin(j, seq)->devs);
                seq++;

                spin_unlock(&j->lock);
                ret = bch2_mark_replicas(c, &replicas.e);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);

        ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);

        return ret;
}