1 /*
2  * bcachefs journalling code, for btree insertions
3  *
4  * Copyright 2012 Google, Inc.
5  */
6
7 #include "bcachefs.h"
8 #include "alloc_foreground.h"
9 #include "bkey_methods.h"
10 #include "btree_gc.h"
11 #include "buckets.h"
12 #include "journal.h"
13 #include "journal_io.h"
14 #include "journal_reclaim.h"
15 #include "journal_seq_blacklist.h"
16 #include "super-io.h"
17
18 #include <trace/events/bcachefs.h>
19
20 static bool __journal_entry_is_open(union journal_res_state state)
21 {
22         return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
23 }
24
25 static bool journal_entry_is_open(struct journal *j)
26 {
27         return __journal_entry_is_open(j->reservations);
28 }
29
30 static void journal_pin_new_entry(struct journal *j, int count)
31 {
32         struct journal_entry_pin_list *p;
33
34         /*
35          * The fifo_push() needs to happen at the same time as j->seq is
36          * incremented for journal_last_seq() to be calculated correctly
37          */
38         atomic64_inc(&j->seq);
39         p = fifo_push_ref(&j->pin);
40
41         INIT_LIST_HEAD(&p->list);
42         INIT_LIST_HEAD(&p->flushed);
43         atomic_set(&p->count, count);
44         p->devs.nr = 0;
45 }
46
47 static void bch2_journal_buf_init(struct journal *j)
48 {
49         struct journal_buf *buf = journal_cur_buf(j);
50
51         memset(buf->has_inode, 0, sizeof(buf->has_inode));
52
53         memset(buf->data, 0, sizeof(*buf->data));
54         buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
55         buf->data->u64s = 0;
56 }
57
58 static inline bool journal_entry_empty(struct jset *j)
59 {
60         struct jset_entry *i;
61
62         if (j->seq != j->last_seq)
63                 return false;
64
65         vstruct_for_each(j, i)
66                 if (i->type || i->u64s)
67                         return false;
68         return true;
69 }
70
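/*
 * Put the journal into an error state: set the reservation state to
 * JOURNAL_ENTRY_ERROR_VAL so no further reservations can be taken, then wake
 * anyone waiting on the journal or on the current buffer.
 */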
71 void bch2_journal_halt(struct journal *j)
72 {
73         union journal_res_state old, new;
74         u64 v = atomic64_read(&j->reservations.counter);
75
76         do {
77                 old.v = new.v = v;
78                 if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
79                         return;
80
81                 new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
82         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
83                                        old.v, new.v)) != old.v);
84
85         journal_wake(j);
86         closure_wake_up(&journal_cur_buf(j)->wait);
87 }
88
89 /* journal entry close/open: */
90
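/*
 * Kick off the write of a journal buffer that's no longer being added to:
 * record how long the write has been pending (unless the need-write flag was
 * only just set by the caller), clear JOURNAL_NEED_WRITE, and start the
 * actual write via bch2_journal_write().
 */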
91 void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
92 {
93         if (!need_write_just_set &&
94             test_bit(JOURNAL_NEED_WRITE, &j->flags))
95                 bch2_time_stats_update(j->delay_time,
96                                        j->need_write_time);
97
98         clear_bit(JOURNAL_NEED_WRITE, &j->flags);
99
100         closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
101 }
102
103 /*
104  * Returns true if journal entry is now closed:
105  */
106 static bool __journal_entry_close(struct journal *j)
107 {
108         struct bch_fs *c = container_of(j, struct bch_fs, journal);
109         struct journal_buf *buf = journal_cur_buf(j);
110         union journal_res_state old, new;
111         u64 v = atomic64_read(&j->reservations.counter);
112         bool set_need_write = false;
113         unsigned sectors;
114
115         lockdep_assert_held(&j->lock);
116
117         do {
118                 old.v = new.v = v;
119                 if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
120                         return true;
121
122                 if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
123                         /* this entry will never be written: */
124                         closure_wake_up(&buf->wait);
125                         return true;
126                 }
127
128                 if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
129                         set_bit(JOURNAL_NEED_WRITE, &j->flags);
130                         j->need_write_time = local_clock();
131                         set_need_write = true;
132                 }
133
134                 if (new.prev_buf_unwritten)
135                         return false;
136
137                 new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
138                 new.idx++;
139                 new.prev_buf_unwritten = 1;
140
141                 BUG_ON(journal_state_count(new, new.idx));
142         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
143                                        old.v, new.v)) != old.v);
144
145         buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
146
147         sectors = vstruct_blocks_plus(buf->data, c->block_bits,
148                                       buf->u64s_reserved) << c->block_bits;
149         BUG_ON(sectors > buf->sectors);
150         buf->sectors = sectors;
151
152         bkey_extent_init(&buf->key);
153
154         /*
155          * We have to set last_seq here, _before_ opening a new journal entry:
156          *
157  * A thread may replace an old pin with a new pin on its current
158          * journal reservation - the expectation being that the journal will
159          * contain either what the old pin protected or what the new pin
160          * protects.
161          *
162          * After the old pin is dropped journal_last_seq() won't include the old
163          * pin, so we can only write the updated last_seq on the entry that
164          * contains whatever the new pin protects.
165          *
166          * Restated, we can _not_ update last_seq for a given entry if there
167          * could be a newer entry open with reservations/pins that have been
168          * taken against it.
169          *
170  * Hence, we want to update/set last_seq on the current journal entry right
171          * before we open a new one:
172          */
173         buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
174
175         if (journal_entry_empty(buf->data))
176                 clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
177         else
178                 set_bit(JOURNAL_NOT_EMPTY, &j->flags);
179
180         journal_pin_new_entry(j, 1);
181
182         bch2_journal_buf_init(j);
183
184         cancel_delayed_work(&j->write_work);
185
186         bch2_journal_space_available(j);
187
188         bch2_journal_buf_put(j, old.idx, set_need_write);
189         return true;
190 }
191
192 static bool journal_entry_close(struct journal *j)
193 {
194         bool ret;
195
196         spin_lock(&j->lock);
197         ret = __journal_entry_close(j);
198         spin_unlock(&j->lock);
199
200         return ret;
201 }
202
203 /*
204  * should _only_ be called from journal_res_get() - when we actually want a
205  * journal reservation - journal entry is open means journal is dirty:
206  *
207  * returns:
208  * 0:           success
209  * -ENOSPC:     journal currently full, must invoke reclaim
210  * -EAGAIN:     journal blocked, must wait
211  * -EROFS:      insufficient rw devices or journal error
212  */
213 static int journal_entry_open(struct journal *j)
214 {
215         struct journal_buf *buf = journal_cur_buf(j);
216         union journal_res_state old, new;
217         int u64s;
218         u64 v;
219
220         lockdep_assert_held(&j->lock);
221         BUG_ON(journal_entry_is_open(j));
222
223         if (j->blocked)
224                 return -EAGAIN;
225
226         if (j->cur_entry_error)
227                 return j->cur_entry_error;
228
229         BUG_ON(!j->cur_entry_sectors);
230
231         buf->u64s_reserved      = j->entry_u64s_reserved;
232         buf->disk_sectors       = j->cur_entry_sectors;
233         buf->sectors            = min(buf->disk_sectors, buf->buf_size >> 9);
234
235         u64s = (int) (buf->sectors << 9) / sizeof(u64) -
236                 journal_entry_overhead(j);
237         u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
238
239         if (u64s <= le32_to_cpu(buf->data->u64s))
240                 return -ENOSPC;
241
242         /*
243          * Must be set before marking the journal entry as open:
244          */
245         j->cur_entry_u64s = u64s;
246
247         v = atomic64_read(&j->reservations.counter);
248         do {
249                 old.v = new.v = v;
250
251                 EBUG_ON(journal_state_count(new, new.idx));
252
253                 if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
254                         return -EROFS;
255
256                 /* Handle any already added entries */
257                 new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
258                 journal_state_inc(&new);
259         } while ((v = atomic64_cmpxchg(&j->reservations.counter,
260                                        old.v, new.v)) != old.v);
261
262         if (j->res_get_blocked_start)
263                 bch2_time_stats_update(j->blocked_time,
264                                        j->res_get_blocked_start);
265         j->res_get_blocked_start = 0;
266
267         mod_delayed_work(system_freezable_wq,
268                          &j->write_work,
269                          msecs_to_jiffies(j->write_delay_ms));
270         journal_wake(j);
271         return 0;
272 }
273
274 static bool journal_quiesced(struct journal *j)
275 {
276         union journal_res_state state = READ_ONCE(j->reservations);
277         bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
278
279         if (!ret)
280                 journal_entry_close(j);
281         return ret;
282 }
283
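/*
 * Wait for the journal to be quiescent: no open journal entry and no write
 * still in flight. journal_quiesced() also closes the current entry if it is
 * still open, so waiting here drives the journal to a clean state.
 */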
284 static void journal_quiesce(struct journal *j)
285 {
286         wait_event(j->wait, journal_quiesced(j));
287 }
288
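/*
 * Delayed-work callback, scheduled from journal_entry_open() with a
 * j->write_delay_ms timeout: close the current entry so that it gets written
 * out even if it never fills up.
 */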
289 static void journal_write_work(struct work_struct *work)
290 {
291         struct journal *j = container_of(work, struct journal, write_work.work);
292
293         journal_entry_close(j);
294 }
295
296 /*
297  * Given an inode number, if that inode number has data in the journal that
298  * hasn't yet been flushed, return the journal sequence number that needs to be
299  * flushed:
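 *
 * Note that has_inode is a small per-buffer bitmap indexed by a hash of the
 * inode number, so collisions can produce false positives; the worst case is
 * that the caller flushes a journal entry it didn't strictly need to.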
300  */
301 u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
302 {
303         size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
304         u64 seq = 0;
305
306         if (!test_bit(h, j->buf[0].has_inode) &&
307             !test_bit(h, j->buf[1].has_inode))
308                 return 0;
309
310         spin_lock(&j->lock);
311         if (test_bit(h, journal_cur_buf(j)->has_inode))
312                 seq = journal_cur_seq(j);
313         else if (test_bit(h, journal_prev_buf(j)->has_inode))
314                 seq = journal_cur_seq(j) - 1;
315         spin_unlock(&j->lock);
316
317         return seq;
318 }
319
320 static int __journal_res_get(struct journal *j, struct journal_res *res,
321                              unsigned flags)
322 {
323         struct bch_fs *c = container_of(j, struct bch_fs, journal);
324         struct journal_buf *buf;
325         int ret;
326 retry:
327         if (journal_res_get_fast(j, res, flags))
328                 return 0;
329
330         if (bch2_journal_error(j))
331                 return -EROFS;
332
333         spin_lock(&j->lock);
334
335         /*
336          * Recheck after taking the lock, so we don't race with another thread
337  * that just did journal_entry_open(), and end up calling journal_entry_close()
338          * unnecessarily
339          */
340         if (journal_res_get_fast(j, res, flags)) {
341                 spin_unlock(&j->lock);
342                 return 0;
343         }
344
345         /*
346          * If we couldn't get a reservation because the current buf filled up,
347          * and we had room for a bigger entry on disk, signal that we want to
348          * realloc the journal bufs:
349          */
350         buf = journal_cur_buf(j);
351         if (journal_entry_is_open(j) &&
352             buf->buf_size >> 9 < buf->disk_sectors &&
353             buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
354                 j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
355
356         if (journal_entry_is_open(j) &&
357             !__journal_entry_close(j)) {
358                 /*
359                  * We failed to get a reservation on the current open journal
360                  * entry because it's full, and we can't close it because
361                  * there's still a previous one in flight:
362                  */
363                 trace_journal_entry_full(c);
364                 ret = -EAGAIN;
365         } else {
366                 ret = journal_entry_open(j);
367         }
368
369         if ((ret == -EAGAIN || ret == -ENOSPC) &&
370             !j->res_get_blocked_start)
371                 j->res_get_blocked_start = local_clock() ?: 1;
372
373         spin_unlock(&j->lock);
374
375         if (!ret)
376                 goto retry;
377         if (ret == -ENOSPC) {
378                 /*
379                  * Journal is full - can't rely on reclaim from work item due to
380                  * freezing:
381                  */
382                 trace_journal_full(c);
383                 if (!(flags & JOURNAL_RES_GET_NONBLOCK))
384                         bch2_journal_reclaim_work(&j->reclaim_work.work);
385                 ret = -EAGAIN;
386         }
387
388         return ret;
389 }
390
391 /*
392  * Essentially the entry point to the journaling code. When bcachefs is doing
393  * a btree insert, it calls this function to get the current journal write.
394  * The journal write is the structure used to set up journal writes. The
395  * calling function then adds its keys to the structure, queuing them for the
396  * next write; see the usage sketch after this function.
397  *
398  * To ensure forward progress, the current task must not be holding any
399  * btree node write locks.
400  */
401 int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
402                                   unsigned flags)
403 {
404         int ret;
405
406         closure_wait_event(&j->async_wait,
407                    (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
408                    (flags & JOURNAL_RES_GET_NONBLOCK));
409         return ret;
410 }
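
/*
 * A minimal usage sketch, modelled on bch2_journal_meta() below; nr_u64s
 * stands for the size of the keys the caller intends to add, and the helpers
 * callers use to append those keys live outside this file:
 *
 *	struct journal_res res;
 *	int ret;
 *
 *	memset(&res, 0, sizeof(res));
 *
 *	ret = bch2_journal_res_get(j, &res, jset_u64s(nr_u64s), 0);
 *	if (ret)
 *		return ret;
 *
 *	... append keys to the open journal entry here ...
 *
 *	bch2_journal_res_put(j, &res);
 *	ret = bch2_journal_flush_seq(j, res.seq);
 */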
411
412 /* journal_entry_res: */
413
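/*
 * Change the amount of space reserved in each journal entry for @res: if the
 * reservation grows and the currently open entry can no longer accommodate
 * it, that entry is closed; otherwise the extra space is charged against the
 * current buffer's u64s_reserved.
 */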
414 void bch2_journal_entry_res_resize(struct journal *j,
415                                    struct journal_entry_res *res,
416                                    unsigned new_u64s)
417 {
418         union journal_res_state state;
419         int d = new_u64s - res->u64s;
420
421         spin_lock(&j->lock);
422
423         j->entry_u64s_reserved += d;
424         if (d <= 0)
425                 goto out;
426
427         j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
428         smp_mb();
429         state = READ_ONCE(j->reservations);
430
431         if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
432             state.cur_entry_offset > j->cur_entry_u64s) {
433                 j->cur_entry_u64s += d;
434                 /*
435                  * Not enough room in current journal entry, have to flush it:
436                  */
437                 __journal_entry_close(j);
438         } else {
439                 journal_cur_buf(j)->u64s_reserved += d;
440         }
441 out:
442         spin_unlock(&j->lock);
443         res->u64s += d;
444 }
445
446 /* journal flushing: */
447
448 u64 bch2_journal_last_unwritten_seq(struct journal *j)
449 {
450         u64 seq;
451
452         spin_lock(&j->lock);
453         seq = journal_cur_seq(j);
454         if (j->reservations.prev_buf_unwritten)
455                 seq--;
456         spin_unlock(&j->lock);
457
458         return seq;
459 }
460
461 /**
462  * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't
463  * open yet, or wait if we cannot
464  *
465  * used by the btree interior update machinery, when it needs to write a new
466  * btree root - every journal entry contains the roots of all the btrees, so it
467  * doesn't need to bother with getting a journal reservation
468  */
469 int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
470 {
471         struct bch_fs *c = container_of(j, struct bch_fs, journal);
472         int ret;
473
474         spin_lock(&j->lock);
475
476         /*
477          * Can't try to open more than one sequence number ahead:
478          */
479         BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
480
481         if (journal_cur_seq(j) > seq ||
482             journal_entry_is_open(j)) {
483                 spin_unlock(&j->lock);
484                 return 0;
485         }
486
487         if (journal_cur_seq(j) < seq &&
488             !__journal_entry_close(j)) {
489                 /* haven't finished writing out the previous one: */
490                 trace_journal_entry_full(c);
491                 ret = -EAGAIN;
492         } else {
493                 BUG_ON(journal_cur_seq(j) != seq);
494
495                 ret = journal_entry_open(j);
496         }
497
498         if ((ret == -EAGAIN || ret == -ENOSPC) &&
499             !j->res_get_blocked_start)
500                 j->res_get_blocked_start = local_clock() ?: 1;
501
502         if (ret == -EAGAIN || ret == -ENOSPC)
503                 closure_wait(&j->async_wait, cl);
504
505         spin_unlock(&j->lock);
506
507         if (ret == -ENOSPC) {
508                 trace_journal_full(c);
509                 bch2_journal_reclaim_work(&j->reclaim_work.work);
510                 ret = -EAGAIN;
511         }
512
513         return ret;
514 }
515
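/*
 * Return the error status for journal sequence number @seq: the journal-wide
 * error if @seq is the current (open) entry, or -EIO if @seq was the previous
 * entry and its write completed without making it to disk.
 */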
516 static int journal_seq_error(struct journal *j, u64 seq)
517 {
518         union journal_res_state state = READ_ONCE(j->reservations);
519
520         if (seq == journal_cur_seq(j))
521                 return bch2_journal_error(j);
522
523         if (seq + 1 == journal_cur_seq(j) &&
524             !state.prev_buf_unwritten &&
525             seq > j->seq_ondisk)
526                 return -EIO;
527
528         return 0;
529 }
530
531 static inline struct journal_buf *
532 journal_seq_to_buf(struct journal *j, u64 seq)
533 {
534         /* seq should be for a journal entry that has been opened: */
535         BUG_ON(seq > journal_cur_seq(j));
536         BUG_ON(seq == journal_cur_seq(j) &&
537                j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
538
539         if (seq == journal_cur_seq(j))
540                 return journal_cur_buf(j);
541         if (seq + 1 == journal_cur_seq(j) &&
542             j->reservations.prev_buf_unwritten)
543                 return journal_prev_buf(j);
544         return NULL;
545 }
546
547 /**
548  * bch2_journal_wait_on_seq - wait for a journal entry to be written
549  *
550  * does _not_ cause @seq to be written immediately - if there is no other
551  * activity to cause the relevant journal entry to be filled up or flushed it
552  * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is
553  * configurable).
554  */
555 void bch2_journal_wait_on_seq(struct journal *j, u64 seq,
556                               struct closure *parent)
557 {
558         struct journal_buf *buf;
559
560         spin_lock(&j->lock);
561
562         if ((buf = journal_seq_to_buf(j, seq))) {
563                 if (!closure_wait(&buf->wait, parent))
564                         BUG();
565
566                 if (seq == journal_cur_seq(j)) {
567                         smp_mb();
568                         if (bch2_journal_error(j))
569                                 closure_wake_up(&buf->wait);
570                 }
571         }
572
573         spin_unlock(&j->lock);
574 }
575
576 /**
577  * bch2_journal_flush_seq_async - wait for a journal entry to be written
578  *
579  * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
580  * necessary
581  */
582 void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
583                                   struct closure *parent)
584 {
585         struct journal_buf *buf;
586
587         spin_lock(&j->lock);
588
589         if (parent &&
590             (buf = journal_seq_to_buf(j, seq)))
591                 if (!closure_wait(&buf->wait, parent))
592                         BUG();
593
594         if (seq == journal_cur_seq(j))
595                 __journal_entry_close(j);
596         spin_unlock(&j->lock);
597 }
598
599 static int journal_seq_flushed(struct journal *j, u64 seq)
600 {
601         int ret;
602
603         spin_lock(&j->lock);
604         ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq);
605
606         if (seq == journal_cur_seq(j))
607                 __journal_entry_close(j);
608         spin_unlock(&j->lock);
609
610         return ret;
611 }
612
613 int bch2_journal_flush_seq(struct journal *j, u64 seq)
614 {
615         u64 start_time = local_clock();
616         int ret, ret2;
617
618         ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
619
620         bch2_time_stats_update(j->flush_seq_time, start_time);
621
622         return ret ?: ret2 < 0 ? ret2 : 0;
623 }
624
625 /**
626  * bch2_journal_meta_async - force a journal entry to be written
627  */
628 void bch2_journal_meta_async(struct journal *j, struct closure *parent)
629 {
630         struct journal_res res;
631
632         memset(&res, 0, sizeof(res));
633
634         bch2_journal_res_get(j, &res, jset_u64s(0), 0);
635         bch2_journal_res_put(j, &res);
636
637         bch2_journal_flush_seq_async(j, res.seq, parent);
638 }
639
640 int bch2_journal_meta(struct journal *j)
641 {
642         struct journal_res res;
643         int ret;
644
645         memset(&res, 0, sizeof(res));
646
647         ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
648         if (ret)
649                 return ret;
650
651         bch2_journal_res_put(j, &res);
652
653         return bch2_journal_flush_seq(j, res.seq);
654 }
655
656 /*
657  * bch2_journal_flush_async - if there is an open journal entry, or a journal
658  * still being written, write it and wait for the write to complete
659  */
660 void bch2_journal_flush_async(struct journal *j, struct closure *parent)
661 {
662         u64 seq, journal_seq;
663
664         spin_lock(&j->lock);
665         journal_seq = journal_cur_seq(j);
666
667         if (journal_entry_is_open(j)) {
668                 seq = journal_seq;
669         } else if (journal_seq) {
670                 seq = journal_seq - 1;
671         } else {
672                 spin_unlock(&j->lock);
673                 return;
674         }
675         spin_unlock(&j->lock);
676
677         bch2_journal_flush_seq_async(j, seq, parent);
678 }
679
680 int bch2_journal_flush(struct journal *j)
681 {
682         u64 seq, journal_seq;
683
684         spin_lock(&j->lock);
685         journal_seq = journal_cur_seq(j);
686
687         if (journal_entry_is_open(j)) {
688                 seq = journal_seq;
689         } else if (journal_seq) {
690                 seq = journal_seq - 1;
691         } else {
692                 spin_unlock(&j->lock);
693                 return 0;
694         }
695         spin_unlock(&j->lock);
696
697         return bch2_journal_flush_seq(j, seq);
698 }
699
700 /* block/unblock the journal: */
701
702 void bch2_journal_unblock(struct journal *j)
703 {
704         spin_lock(&j->lock);
705         j->blocked--;
706         spin_unlock(&j->lock);
707
708         journal_wake(j);
709 }
710
711 void bch2_journal_block(struct journal *j)
712 {
713         spin_lock(&j->lock);
714         j->blocked++;
715         spin_unlock(&j->lock);
716
717         journal_quiesce(j);
718 }
719
720 /* allocate journal on a device: */
721
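/*
 * Grow a device's journal to @nr buckets: allocate the new buckets (directly
 * if the filesystem isn't running yet, otherwise through the normal bucket
 * allocator) and add them to both the superblock journal field and the
 * in-memory journal_device.
 */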
722 static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
723                                          bool new_fs, struct closure *cl)
724 {
725         struct bch_fs *c = ca->fs;
726         struct journal_device *ja = &ca->journal;
727         struct bch_sb_field_journal *journal_buckets;
728         u64 *new_bucket_seq = NULL, *new_buckets = NULL;
729         int ret = 0;
730
731         /* don't handle reducing nr of buckets yet: */
732         if (nr <= ja->nr)
733                 return 0;
734
735         ret = -ENOMEM;
736         new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
737         new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
738         if (!new_buckets || !new_bucket_seq)
739                 goto err;
740
741         journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
742                                                  nr + sizeof(*journal_buckets) / sizeof(u64));
743         if (!journal_buckets)
744                 goto err;
745
746         /*
747          * We may be called from the device add path, before the new device has
748          * actually been added to the running filesystem:
749          */
750         if (c)
751                 spin_lock(&c->journal.lock);
752
753         memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
754         memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
755         swap(new_buckets,       ja->buckets);
756         swap(new_bucket_seq,    ja->bucket_seq);
757
758         if (c)
759                 spin_unlock(&c->journal.lock);
760
761         while (ja->nr < nr) {
762                 struct open_bucket *ob = NULL;
763                 long bucket;
764
765                 if (new_fs) {
766                         bucket = bch2_bucket_alloc_new_fs(ca);
767                         if (bucket < 0) {
768                                 ret = -ENOSPC;
769                                 goto err;
770                         }
771                 } else {
772                         ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
773                                                false, cl);
774                         if (IS_ERR(ob)) {
775                                 ret = cl ? -EAGAIN : -ENOSPC;
776                                 goto err;
777                         }
778
779                         bucket = sector_to_bucket(ca, ob->ptr.offset);
780                 }
781
782                 if (c) {
783                         percpu_down_read_preempt_disable(&c->mark_lock);
784                         spin_lock(&c->journal.lock);
785                 } else {
786                         preempt_disable();
787                 }
788
789                 __array_insert_item(ja->buckets,                ja->nr, ja->last_idx);
790                 __array_insert_item(ja->bucket_seq,             ja->nr, ja->last_idx);
791                 __array_insert_item(journal_buckets->buckets,   ja->nr, ja->last_idx);
792
793                 ja->buckets[ja->last_idx] = bucket;
794                 ja->bucket_seq[ja->last_idx] = 0;
795                 journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
796
797                 if (ja->last_idx < ja->nr) {
798                         if (ja->cur_idx >= ja->last_idx)
799                                 ja->cur_idx++;
800                         ja->last_idx++;
801                 }
802                 ja->nr++;
803
804                 bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
805                                           ca->mi.bucket_size,
806                                           gc_phase(GC_PHASE_SB),
807                                           0);
808
809                 if (c) {
810                         spin_unlock(&c->journal.lock);
811                         percpu_up_read_preempt_enable(&c->mark_lock);
812                 } else {
813                         preempt_enable();
814                 }
815
816                 if (!new_fs)
817                         bch2_open_bucket_put(c, ob);
818         }
819
820         ret = 0;
821 err:
822         kfree(new_bucket_seq);
823         kfree(new_buckets);
824
825         return ret;
826 }
827
828 /*
829  * Allocate more journal space at runtime - not currently making use of it, but
830  * the code works:
831  */
832 int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
833                                 unsigned nr)
834 {
835         struct journal_device *ja = &ca->journal;
836         struct closure cl;
837         unsigned current_nr;
838         int ret;
839
840         closure_init_stack(&cl);
841
842         do {
843                 struct disk_reservation disk_res = { 0, 0 };
844
845                 closure_sync(&cl);
846
847                 mutex_lock(&c->sb_lock);
848                 current_nr = ja->nr;
849
850                 /*
851                  * note: journal buckets aren't really counted as _sectors_ used yet, so
852                  * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
853                  * when space used goes up without a reservation - but we do need the
854                  * reservation to ensure we'll actually be able to allocate:
855                  */
856
857                 if (bch2_disk_reservation_get(c, &disk_res,
858                                               bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
859                         mutex_unlock(&c->sb_lock);
860                         return -ENOSPC;
861                 }
862
863                 ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
864
865                 bch2_disk_reservation_put(c, &disk_res);
866
867                 if (ja->nr != current_nr)
868                         bch2_write_super(c);
869                 mutex_unlock(&c->sb_lock);
870         } while (ret == -EAGAIN);
871
872         return ret;
873 }
874
875 int bch2_dev_journal_alloc(struct bch_dev *ca)
876 {
877         unsigned nr;
878
879         if (dynamic_fault("bcachefs:add:journal_alloc"))
880                 return -ENOMEM;
881
882         /*
883          * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
884          * is smaller:
885          */
886         nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
887                      BCH_JOURNAL_BUCKETS_MIN,
888                      min(1 << 10,
889                          (1 << 20) / ca->mi.bucket_size));
890
891         return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
892 }
893
894 /* startup/shutdown: */
895
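/*
 * Returns true if the journal write currently in flight has a pointer to
 * @dev_idx - used by bch2_dev_journal_stop() to wait for a device's journal
 * writes to finish before the device goes away.
 */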
896 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
897 {
898         union journal_res_state state;
899         struct journal_buf *w;
900         bool ret;
901
902         spin_lock(&j->lock);
903         state = READ_ONCE(j->reservations);
904         w = j->buf + !state.idx;
905
906         ret = state.prev_buf_unwritten &&
907                 bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
908         spin_unlock(&j->lock);
909
910         return ret;
911 }
912
913 void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
914 {
915         wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
916 }
917
918 void bch2_fs_journal_stop(struct journal *j)
919 {
920         struct bch_fs *c = container_of(j, struct bch_fs, journal);
921
922         wait_event(j->wait, journal_entry_close(j));
923
924         /* do we need to write another journal entry? */
925         if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
926             c->btree_roots_dirty)
927                 bch2_journal_meta(j);
928
929         journal_quiesce(j);
930
931         BUG_ON(!bch2_journal_error(j) &&
932                test_bit(JOURNAL_NOT_EMPTY, &j->flags));
933
934         cancel_delayed_work_sync(&j->write_work);
935         cancel_delayed_work_sync(&j->reclaim_work);
936 }
937
938 void bch2_fs_journal_start(struct journal *j)
939 {
940         struct bch_fs *c = container_of(j, struct bch_fs, journal);
941         struct journal_seq_blacklist *bl;
942         u64 blacklist = 0;
943
944         list_for_each_entry(bl, &j->seq_blacklist, list)
945                 blacklist = max(blacklist, bl->end);
946
947         spin_lock(&j->lock);
948
949         set_bit(JOURNAL_STARTED, &j->flags);
950
951         while (journal_cur_seq(j) < blacklist)
952                 journal_pin_new_entry(j, 0);
953
954         /*
955          * __journal_entry_close() only inits the next journal entry when it
956          * closes an open journal entry - the very first journal entry gets
957          * initialized here:
958          */
959         journal_pin_new_entry(j, 1);
960         bch2_journal_buf_init(j);
961
962         c->last_bucket_seq_cleanup = journal_cur_seq(j);
963
964         bch2_journal_space_available(j);
965         spin_unlock(&j->lock);
966
967         /*
968  * We may be adding entries to the next journal entry before we've
969  * allocated space on disk for it - that's ok, because these entries
970  * only have to go down with the next journal entry we write:
971          */
972         bch2_journal_seq_blacklist_write(j);
973
974         queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
975 }
976
977 /* init/exit: */
978
979 void bch2_dev_journal_exit(struct bch_dev *ca)
980 {
981         kfree(ca->journal.bio);
982         kfree(ca->journal.buckets);
983         kfree(ca->journal.bucket_seq);
984
985         ca->journal.bio         = NULL;
986         ca->journal.buckets     = NULL;
987         ca->journal.bucket_seq  = NULL;
988 }
989
990 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
991 {
992         struct journal_device *ja = &ca->journal;
993         struct bch_sb_field_journal *journal_buckets =
994                 bch2_sb_get_journal(sb);
995         unsigned i;
996
997         ja->nr = bch2_nr_journal_buckets(journal_buckets);
998
999         ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
1000         if (!ja->bucket_seq)
1001                 return -ENOMEM;
1002
1003         ca->journal.bio = bio_kmalloc(GFP_KERNEL,
1004                         DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
1005         if (!ca->journal.bio)
1006                 return -ENOMEM;
1007
1008         ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
1009         if (!ja->buckets)
1010                 return -ENOMEM;
1011
1012         for (i = 0; i < ja->nr; i++)
1013                 ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
1014
1015         return 0;
1016 }
1017
1018 void bch2_fs_journal_exit(struct journal *j)
1019 {
1020         kvpfree(j->buf[1].data, j->buf[1].buf_size);
1021         kvpfree(j->buf[0].data, j->buf[0].buf_size);
1022         free_fifo(&j->pin);
1023 }
1024
1025 int bch2_fs_journal_init(struct journal *j)
1026 {
1027         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1028         static struct lock_class_key res_key;
1029         int ret = 0;
1030
1031         pr_verbose_init(c->opts, "");
1032
1033         spin_lock_init(&j->lock);
1034         spin_lock_init(&j->err_lock);
1035         init_waitqueue_head(&j->wait);
1036         INIT_DELAYED_WORK(&j->write_work, journal_write_work);
1037         INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
1038         init_waitqueue_head(&j->pin_flush_wait);
1039         mutex_init(&j->blacklist_lock);
1040         INIT_LIST_HEAD(&j->seq_blacklist);
1041         mutex_init(&j->reclaim_lock);
1042
1043         lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
1044
1045         j->buf[0].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
1046         j->buf[1].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
1047         j->write_delay_ms       = 1000;
1048         j->reclaim_delay_ms     = 100;
1049
1050         /* Btree roots: */
1051         j->entry_u64s_reserved +=
1052                 BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
1053
1054         atomic64_set(&j->reservations.counter,
1055                 ((union journal_res_state)
1056                  { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
1057
1058         if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
1059             !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
1060             !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
1061                 ret = -ENOMEM;
1062                 goto out;
1063         }
1064
1065         j->pin.front = j->pin.back = 1;
1066 out:
1067         pr_verbose_init(c->opts, "ret %i", ret);
1068         return ret;
1069 }
1070
1071 /* debug: */
1072
1073 ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
1074 {
1075         struct printbuf out = _PBUF(buf, PAGE_SIZE);
1076         struct bch_fs *c = container_of(j, struct bch_fs, journal);
1077         union journal_res_state s;
1078         struct bch_dev *ca;
1079         unsigned iter;
1080
1081         rcu_read_lock();
1082         spin_lock(&j->lock);
1083         s = READ_ONCE(j->reservations);
1084
1085         pr_buf(&out,
1086                "active journal entries:\t%llu\n"
1087                "seq:\t\t\t%llu\n"
1088                "last_seq:\t\t%llu\n"
1089                "last_seq_ondisk:\t%llu\n"
1090                "current entry:\t\t",
1091                fifo_used(&j->pin),
1092                journal_cur_seq(j),
1093                journal_last_seq(j),
1094                j->last_seq_ondisk);
1095
1096         switch (s.cur_entry_offset) {
1097         case JOURNAL_ENTRY_ERROR_VAL:
1098                 pr_buf(&out, "error\n");
1099                 break;
1100         case JOURNAL_ENTRY_CLOSED_VAL:
1101                 pr_buf(&out, "closed\n");
1102                 break;
1103         default:
1104                 pr_buf(&out, "%u/%u\n",
1105                        s.cur_entry_offset,
1106                        j->cur_entry_u64s);
1107                 break;
1108         }
1109
1110         pr_buf(&out,
1111                "current entry refs:\t%u\n"
1112                "prev entry unwritten:\t",
1113                journal_state_count(s, s.idx));
1114
1115         if (s.prev_buf_unwritten)
1116                 pr_buf(&out, "yes, ref %u\n",
1117                        journal_state_count(s, !s.idx));
1118         else
1119                 pr_buf(&out, "no\n");
1120
1121         pr_buf(&out,
1122                "need write:\t\t%i\n"
1123                "replay done:\t\t%i\n",
1124                test_bit(JOURNAL_NEED_WRITE,     &j->flags),
1125                test_bit(JOURNAL_REPLAY_DONE,    &j->flags));
1126
1127         for_each_member_device_rcu(ca, c, iter,
1128                                    &c->rw_devs[BCH_DATA_JOURNAL]) {
1129                 struct journal_device *ja = &ca->journal;
1130
1131                 if (!ja->nr)
1132                         continue;
1133
1134                 pr_buf(&out,
1135                        "dev %u:\n"
1136                        "\tnr\t\t%u\n"
1137                        "\tavailable\t%u:%u\n"
1138                        "\tcur_idx\t\t%u (seq %llu)\n"
1139                        "\tlast_idx\t%u (seq %llu)\n",
1140                        iter, ja->nr,
1141                        bch2_journal_dev_buckets_available(j, ja),
1142                        ja->sectors_free,
1143                        ja->cur_idx,     ja->bucket_seq[ja->cur_idx],
1144                        ja->last_idx,    ja->bucket_seq[ja->last_idx]);
1145         }
1146
1147         spin_unlock(&j->lock);
1148         rcu_read_unlock();
1149
1150         return out.pos - buf;
1151 }
1152
1153 ssize_t bch2_journal_print_pins(struct journal *j, char *buf)
1154 {
1155         struct printbuf out = _PBUF(buf, PAGE_SIZE);
1156         struct journal_entry_pin_list *pin_list;
1157         struct journal_entry_pin *pin;
1158         u64 i;
1159
1160         spin_lock(&j->lock);
1161         fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
1162                 pr_buf(&out, "%llu: count %u\n",
1163                        i, atomic_read(&pin_list->count));
1164
1165                 list_for_each_entry(pin, &pin_list->list, list)
1166                         pr_buf(&out, "\t%p %pf\n",
1167                                pin, pin->flush);
1168
1169                 if (!list_empty(&pin_list->flushed))
1170                         pr_buf(&out, "flushed:\n");
1171
1172                 list_for_each_entry(pin, &pin_list->flushed, list)
1173                         pr_buf(&out, "\t%p %pf\n",
1174                                pin, pin->flush);
1175         }
1176         spin_unlock(&j->lock);
1177
1178         return out.pos - buf;
1179 }