#include "bcachefs.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"

/*
 * Journal entry pinning - machinery for holding a reference on a given journal
 * entry, holding it open to ensure it gets replayed during recovery:
 */

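/*
 * Illustrative usage sketch: a subsystem that journals an update pins the
 * sequence number the update went into, and passes a flush_fn that writes out
 * the dirty state and then drops the pin.  The names my_dirty_obj and
 * my_obj_flush are hypothetical; only bch2_journal_pin_add(),
 * bch2_journal_pin_drop() and the journal_pin_flush_fn calling convention
 * come from this file:
 *
 *	struct my_dirty_obj {
 *		struct journal_entry_pin journal_pin;
 *	};
 *
 *	static void my_obj_flush(struct journal *j,
 *				 struct journal_entry_pin *pin, u64 seq)
 *	{
 *		struct my_dirty_obj *obj =
 *			container_of(pin, struct my_dirty_obj, journal_pin);
 *
 *		... write out whatever made journal entry @seq dirty, then:
 *		bch2_journal_pin_drop(j, &obj->journal_pin);
 *	}
 *
 *	... and after journalling an update to @obj in the entry with sequence
 *	number @seq:
 *	bch2_journal_pin_add(j, seq, &obj->journal_pin, my_obj_flush);
 */
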
static inline u64 journal_pin_seq(struct journal *j,
				  struct journal_entry_pin_list *pin_list)
{
	return fifo_entry_idx_abs(&j->pin, pin_list);
}

u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
{
	u64 ret = 0;

	spin_lock(&j->lock);
	if (journal_pin_active(pin))
		ret = journal_pin_seq(j, pin->pin_list);
	spin_unlock(&j->lock);

	return ret;
}

static inline void __journal_pin_add(struct journal *j,
				     struct journal_entry_pin_list *pin_list,
				     struct journal_entry_pin *pin,
				     journal_pin_flush_fn flush_fn)
{
	BUG_ON(journal_pin_active(pin));
	BUG_ON(!atomic_read(&pin_list->count));

	atomic_inc(&pin_list->count);
	pin->pin_list	= pin_list;
	pin->flush	= flush_fn;

	if (flush_fn)
		list_add(&pin->list, &pin_list->list);
	else
		INIT_LIST_HEAD(&pin->list);

	/*
	 * If the journal is currently full, we might want to call flush_fn
	 * immediately:
	 */
	journal_wake(j);
}

void bch2_journal_pin_add(struct journal *j, u64 seq,
			  struct journal_entry_pin *pin,
			  journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);
	__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
	spin_unlock(&j->lock);
}

static inline void __journal_pin_drop(struct journal *j,
				      struct journal_entry_pin *pin)
{
	struct journal_entry_pin_list *pin_list = pin->pin_list;

	if (!journal_pin_active(pin))
		return;

	pin->pin_list = NULL;
	list_del_init(&pin->list);

	/*
	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
	 * writing a new last_seq will now make another bucket available:
	 */
	if (atomic_dec_and_test(&pin_list->count) &&
	    pin_list == &fifo_peek_front(&j->pin))
		bch2_journal_reclaim_fast(j);
}

void bch2_journal_pin_drop(struct journal *j,
			   struct journal_entry_pin *pin)
{
	spin_lock(&j->lock);
	__journal_pin_drop(j, pin);
	spin_unlock(&j->lock);
}

void bch2_journal_pin_add_if_older(struct journal *j,
				   struct journal_entry_pin *src_pin,
				   struct journal_entry_pin *pin,
				   journal_pin_flush_fn flush_fn)
{
	spin_lock(&j->lock);

	if (journal_pin_active(src_pin) &&
	    (!journal_pin_active(pin) ||
	     journal_pin_seq(j, src_pin->pin_list) <
	     journal_pin_seq(j, pin->pin_list))) {
		__journal_pin_drop(j, pin);
		__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
	}

	spin_unlock(&j->lock);
}

/*
 * Journal reclaim: flush references to open journal entries to reclaim space
 * in the journal
 *
 * May be done by the journal code in the background as needed to free up space
 * for more journal entries, or as part of doing a clean shutdown, or to migrate
 * data off of a specific device:
 */

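/*
 * Those cases map onto the entry points below: background reclaim is driven
 * by bch2_journal_reclaim_work(), flushing everything (e.g. for a clean
 * shutdown) is bch2_journal_flush_all_pins(), and evacuating a single device
 * is bch2_journal_flush_device_pins().
 */
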
/**
 * bch2_journal_reclaim_fast - do the fast part of journal reclaim
 *
 * Called from IO submission context, does not block. Cleans up after btree
 * write completions by advancing the journal pin and each device's last_idx,
 * kicking off discards and background reclaim as necessary.
 */
void bch2_journal_reclaim_fast(struct journal *j)
{
	struct journal_entry_pin_list temp;
	bool popped = false;

	lockdep_assert_held(&j->lock);

	/*
	 * Unpin journal entries whose reference counts reached zero, meaning
	 * all btree nodes got written out
	 */
	while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
		BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
		BUG_ON(!fifo_pop(&j->pin, temp));
		popped = true;
	}

	if (popped)
		journal_wake(j);
}

static struct journal_entry_pin *
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
	struct journal_entry_pin_list *pin_list;
	struct journal_entry_pin *ret;
	u64 iter;

	/* no need to iterate over empty fifo entries: */
	bch2_journal_reclaim_fast(j);

	fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
		if (iter > seq_to_flush)
			break;

		ret = list_first_entry_or_null(&pin_list->list,
				struct journal_entry_pin, list);
		if (ret) {
			/* must be list_del_init(), see bch2_journal_pin_drop() */
			list_move(&ret->list, &pin_list->flushed);
			*seq = iter;
			return ret;
		}
	}

	return NULL;
}

static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
	struct journal_entry_pin *ret;

	spin_lock(&j->lock);
	ret = __journal_get_next_pin(j, seq_to_flush, seq);
	spin_unlock(&j->lock);

	return ret;
}

static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
	bool ret;

	spin_lock(&j->lock);
	ret = ja->nr &&
	      (ja->last_idx != ja->cur_idx &&
	       ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
	spin_unlock(&j->lock);

	return ret;
}

/**
 * bch2_journal_reclaim_work - free up journal buckets
 *
 * Background journal reclaim writes out btree nodes. It should be run
 * early enough so that we never completely run out of journal buckets.
 *
 * High watermarks for triggering background reclaim:
 * - FIFO has fewer than 512 entries left
 * - fewer than 25% journal buckets free
 *
 * Background reclaim runs until low watermarks are reached:
 * - FIFO has more than 1024 entries left
 * - more than 50% journal buckets free
 *
 * As long as a reclaim can complete in the time it takes to fill up
 * 512 journal entries or 25% of all journal buckets, then
 * journal_next_bucket() should not stall.
 */
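/*
 * Worked example of the 50% bucket target computed below, with made-up
 * numbers: if a device has ja->nr == 8 journal buckets and ja->cur_idx == 2,
 * then bucket_to_flush == (2 + 8/2) % 8 == 6 and seq_to_flush is raised to
 * ja->bucket_seq[6].  Once every pin up to that sequence number has been
 * flushed, last_seq can advance past everything in the older half of the
 * buckets (3 through 6 here), and should_discard_bucket() then allows them
 * to be discarded and reused.
 */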
void bch2_journal_reclaim_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(to_delayed_work(work),
				struct bch_fs, journal.reclaim_work);
	struct journal *j = &c->journal;
	struct bch_dev *ca;
	struct journal_entry_pin *pin;
	u64 seq, seq_to_flush = 0;
	unsigned iter, bucket_to_flush;
	unsigned long next_flush;
	bool reclaim_lock_held = false, need_flush;

	/*
	 * Advance last_idx to point to the oldest journal entry containing
	 * btree node updates that have not yet been written out
	 */
	for_each_rw_member(ca, c, iter) {
		struct journal_device *ja = &ca->journal;

		if (!ja->nr)
			continue;

		while (should_discard_bucket(j, ja)) {
			if (!reclaim_lock_held) {
				/*
				 * might be called from __journal_res_get()
				 * under wait_event() - have to go back to
				 * TASK_RUNNING before doing something that
				 * would block, but only if we're doing work:
				 */
				__set_current_state(TASK_RUNNING);

				mutex_lock(&j->reclaim_lock);
				reclaim_lock_held = true;
				/* recheck under reclaim_lock: */
				continue;
			}

			if (ca->mi.discard &&
			    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
				blkdev_issue_discard(ca->disk_sb.bdev,
					bucket_to_sector(ca,
						ja->buckets[ja->last_idx]),
					ca->mi.bucket_size, GFP_NOIO, 0);

			spin_lock(&j->lock);
			ja->last_idx = (ja->last_idx + 1) % ja->nr;
			spin_unlock(&j->lock);

			journal_wake(j);
		}

		/*
		 * Write out enough btree nodes to free up 50% of the journal
		 * buckets
		 */
		spin_lock(&j->lock);
		bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
		seq_to_flush = max_t(u64, seq_to_flush,
				     ja->bucket_seq[bucket_to_flush]);
		spin_unlock(&j->lock);
	}

	if (reclaim_lock_held)
		mutex_unlock(&j->reclaim_lock);

	/* Also flush if the pin fifo is more than half full */
	spin_lock(&j->lock);
	seq_to_flush = max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) -
			     (j->pin.size >> 1));
	spin_unlock(&j->lock);

	/*
	 * If it's been longer than j->reclaim_delay_ms since we last flushed,
	 * make sure to flush at least one journal pin:
	 */
	next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
	need_flush = time_after(jiffies, next_flush);

	while ((pin = journal_get_next_pin(j, need_flush
					   ? U64_MAX
					   : seq_to_flush, &seq))) {
		__set_current_state(TASK_RUNNING);
		pin->flush(j, pin, seq);
		need_flush = false;

		j->last_flushed = jiffies;
	}

	if (!test_bit(BCH_FS_RO, &c->flags))
		queue_delayed_work(system_freezable_wq, &j->reclaim_work,
				   msecs_to_jiffies(j->reclaim_delay_ms));
}

static int journal_flush_done(struct journal *j, u64 seq_to_flush,
			      struct journal_entry_pin **pin,
			      u64 *pin_seq)
{
	int ret;

	*pin = NULL;

	ret = bch2_journal_error(j);
	if (ret)
		return ret;

	spin_lock(&j->lock);
	/*
	 * If journal replay hasn't completed, the unreplayed journal entries
	 * hold refs on their corresponding sequence numbers
	 */
	ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
		!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
		journal_last_seq(j) > seq_to_flush ||
		(fifo_used(&j->pin) == 1 &&
		 atomic_read(&fifo_peek_front(&j->pin).count) == 1);
	spin_unlock(&j->lock);

	return ret;
}

int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin *pin;
	u64 pin_seq;
	bool flush;

	if (!test_bit(JOURNAL_STARTED, &j->flags))
		return 0;
again:
	wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
	if (pin) {
		/* flushing a journal pin might cause a new one to be added: */
		pin->flush(j, pin, pin_seq);
		goto again;
	}

	spin_lock(&j->lock);
	flush = journal_last_seq(j) != j->last_seq_ondisk ||
		(seq_to_flush == U64_MAX && c->btree_roots_dirty);
	spin_unlock(&j->lock);

	return flush ? bch2_journal_meta(j) : 0;
}

int bch2_journal_flush_all_pins(struct journal *j)
{
	return bch2_journal_flush_pins(j, U64_MAX);
}

int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_entry_pin_list *p;
	struct bch_devs_list devs;
	u64 iter, seq = 0;
	int ret = 0;

	spin_lock(&j->lock);
	fifo_for_each_entry_ptr(p, &j->pin, iter)
		if (dev_idx >= 0
		    ? bch2_dev_list_has_dev(p->devs, dev_idx)
		    : p->devs.nr < c->opts.metadata_replicas)
			seq = iter;
	spin_unlock(&j->lock);

	ret = bch2_journal_flush_pins(j, seq);
	if (ret)
		return ret;

	mutex_lock(&c->replicas_gc_lock);
	bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);

	seq = 0;

	spin_lock(&j->lock);
	while (!ret && seq < j->pin.back) {
		seq = max(seq, journal_last_seq(j));
		devs = journal_seq_pin(j, seq)->devs;
		seq++;

		spin_unlock(&j->lock);
		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
		spin_lock(&j->lock);
	}
	spin_unlock(&j->lock);

	bch2_replicas_gc_end(c, ret);
	mutex_unlock(&c->replicas_gc_lock);

	return ret;
}