- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-
- sectors = vstruct_sectors(j, c->block_bits);
-next_block:
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-static void bch2_journal_read_device(struct closure *cl)
-{
-#define read_bucket(b) \
- ({ \
- bool entries_found = false; \
- ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
- &entries_found); \
- if (ret) \
- goto err; \
- __set_bit(b, bitmap); \
- entries_found; \
- })
-
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
- struct journal_read_buf buf = { NULL, 0 };
-
- DECLARE_BITMAP(bitmap, ja->nr);
- unsigned i, l, r;
- u64 seq = 0;
- int ret;
-
- if (!ja->nr)
- goto out;
-
- bitmap_zero(bitmap, ja->nr);
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
-	/*
-	 * If the device supports discard but not secure discard, the live
-	 * journal entries might not form a contiguous range, so we can't do
-	 * the fancy Fibonacci hash/binary search - for now that path is
-	 * bypassed and every bucket is read linearly:
-	 */
- for (i = 0; i < ja->nr; i++)
- read_bucket(i);
- goto search_done;
-
- if (!blk_queue_nonrot(q))
- goto linear_scan;
-
- /*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
- */
- for (i = 0; i < ja->nr; i++) {
- l = (i * 2654435769U) % ja->nr;
-
- if (test_bit(l, bitmap))
- break;
-
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-linear_scan:
- for (l = find_first_zero_bit(bitmap, ja->nr);
- l < ja->nr;
- l = find_next_zero_bit(bitmap, ja->nr, l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ja->nr)
- goto out;
-bsearch:
- /* Binary search */
- r = find_next_bit(bitmap, ja->nr, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- unsigned m = (l + r) >> 1;
- u64 cur_seq = seq;
-
- read_bucket(m);
-
- if (cur_seq != seq)
- l = m;
- else
- r = m;
- }
-
-search_done:
- /*
- * Find the journal bucket with the highest sequence number:
- *
-	 * If there are duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- seq = 0;
-
- for (i = 0; i < ja->nr; i++)
- if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
- /*
- * When journal_next_bucket() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- seq = ja->bucket_seq[i];
- }
-
- /*
- * Set last_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
- /*
- * Read buckets in reverse order until we stop finding more journal
- * entries:
- */
- for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
- i != ja->cur_idx;
- i = (i + ja->nr - 1) % ja->nr)
- if (!test_bit(i, bitmap) &&
- !read_bucket(i))
- break;
-out:
- kvpfree(buf.data, buf.size);
- percpu_ref_put(&ca->io_ref);
- closure_return(cl);
-err:
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-#undef read_bucket
-}
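
For reference, here's a minimal userspace sketch of the probe-then-bisect
strategy above: probe buckets in golden-ratio order until a live one turns
up, then binary search for the edge of the live range. The bucket_seq array
and helper names are illustrative stand-ins, not bcachefs API, and wrap
handling is omitted:

#include <stdbool.h>
#include <stdio.h>

#define NR 8

/* stand-in for ja->bucket_seq; 0 means "no journal entry found": */
static unsigned long long bucket_seq[NR] = { 0, 0, 101, 102, 103, 104, 0, 0 };

static bool read_bucket(unsigned b)
{
	return bucket_seq[b] != 0;
}

int main(void)
{
	unsigned i, l = 0, r = NR;

	/* probe in golden-ratio order to find a live bucket quickly: */
	for (i = 0; i < NR; i++) {
		l = (i * 2654435769U) % NR;
		if (read_bucket(l))
			break;
	}

	/* bisect for the right edge of the live range: */
	while (l + 1 < r) {
		unsigned m = (l + r) / 2;

		if (read_bucket(m))
			l = m;
		else
			r = m;
	}

	printf("newest live bucket: %u (seq %llu)\n", l, bucket_seq[l]);
	return 0;
}

The real code bisects on whether the highest sequence number seen advanced,
not on a simple live/dead test, since old buckets still contain stale entries.
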
-
-void bch2_journal_entries_free(struct list_head *list)
-{
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
-static int journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- struct journal_entry_pin_list *p)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *entry;
- struct journal_seq_blacklist *bl;
- u64 seq;
-
- for_each_jset_entry_type(entry, &i->j,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
- seq = le64_to_cpu(bl_entry->seq);
-
- bch_verbose(c, "blacklisting existing journal seq %llu", seq);
-
- bl = bch2_journal_seq_blacklisted_new(j, seq);
- if (!bl)
- return -ENOMEM;
-
- journal_pin_add_entry(j, p, &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- return 0;
-}
-
-static inline bool journal_has_keys(struct list_head *list)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- list_for_each_entry(i, list, list)
- for_each_jset_key(k, _n, entry, &i->j)
- return true;
-
- return false;
-}
-
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct journal_list jlist;
- struct journal_replay *i;
- struct journal_entry_pin_list *p;
- struct bch_dev *ca;
- u64 cur_seq, end_seq, seq;
- unsigned iter, keys = 0, entries = 0;
- size_t nr;
- bool degraded = false;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.head = list;
- jlist.ret = 0;
-
- for_each_member_device(ca, c, iter) {
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
- continue;
-
- if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO) &&
- percpu_ref_tryget(&ca->io_ref))
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- else
- degraded = true;
- }
-
- closure_sync(&jlist.cl);
-
- if (jlist.ret)
- return jlist.ret;
-
-	if (list_empty(list)) {
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
- fsck_err_on(c->sb.clean && journal_has_keys(list), c,
- "filesystem marked clean but journal has keys to replay");
-
- list_for_each_entry(i, list, list) {
- ret = journal_entry_validate_entries(c, &i->j, READ);
- if (ret)
- goto fsck_err;
-
- /*
- * If we're mounting in degraded mode - if we didn't read all
- * the devices - this is wrong:
- */
-
- if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
- i->devs), c,
- "superblock not marked as containing replicas (type %u)",
- BCH_DATA_JOURNAL))) {
- ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
- if (ret)
- return ret;
- }
- }
-
- i = list_last_entry(list, struct journal_replay, list);
-
- nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
-
- if (nr > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%zu open entries)", nr);
- return -ENOMEM;
- }
- }
-
- atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
- j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
-
- j->pin.front = le64_to_cpu(i->j.last_seq);
- j->pin.back = le64_to_cpu(i->j.seq) + 1;
-
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
-
- mutex_lock(&j->blacklist_lock);
-
- list_for_each_entry(i, list, list) {
- p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
-
- if (journal_seq_blacklist_read(j, i, p)) {
- mutex_unlock(&j->blacklist_lock);
- return -ENOMEM;
- }
- }
-
- mutex_unlock(&j->blacklist_lock);
-
- cur_seq = journal_last_seq(j);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
-
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- bool blacklisted;
-
- mutex_lock(&j->blacklist_lock);
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(j, cur_seq))
- cur_seq++;
-
- blacklisted = journal_seq_blacklist_find(j,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&j->blacklist_lock);
-
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
-
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- journal_last_seq(j), end_seq);
-
- cur_seq = le64_to_cpu(i->j.seq) + 1;
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
- }
-
- bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
- keys, entries, journal_cur_seq(j));
-fsck_err:
- return ret;
-}
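
The gap/blacklist logic in the loop above boils down to: walk entries in
sequence order, step over sequence numbers that were deliberately
blacklisted, and complain about any hole that remains. A self-contained
sketch (the blacklist contents are made up for the example):

#include <stdbool.h>
#include <stdio.h>

static bool blacklisted(unsigned long long seq)
{
	return seq == 12;	/* pretend seq 12 was blacklisted */
}

int main(void)
{
	unsigned long long seqs[] = { 10, 11, 13, 15 };
	unsigned long long cur = seqs[0];
	unsigned i;

	for (i = 0; i < 4; i++) {
		while (cur < seqs[i] && blacklisted(cur))
			cur++;	/* a blacklisted seq is an expected hole */

		if (seqs[i] != cur)
			printf("journal entries %llu-%llu missing!\n",
			       cur, seqs[i] - 1);

		cur = seqs[i] + 1;
	}
	return 0;
}

Here 12 is skipped silently but the hole at 14 is reported.
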
-
-int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
-{
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
- int ret;
-
- list_for_each_entry(r, list, list)
- for_each_jset_key(k, n, j, &r->j) {
- enum bkey_type type = bkey_type(j->level, j->btree_id);
- struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
-
- if (btree_type_has_ptrs(type)) {
- ret = bch2_btree_mark_key_initial(c, type, k_s_c);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
-{
- struct journal_buf *w = journal_prev_buf(j);
-
- atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- __bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-#if 0
- closure_call(&j->io, journal_write, NULL, NULL);
-#else
- /* Shut sparse up: */
- closure_init(&j->io, NULL);
- set_closure_fn(&j->io, journal_write, NULL);
- journal_write(&j->io);
-#endif
-}
-
-static void journal_pin_new_entry(struct journal *j, int count)
-{
- struct journal_entry_pin_list *p;
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for journal_last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
- p = fifo_push_ref(&j->pin);
-
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
-}
-
-static void bch2_journal_buf_init(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
-
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(journal_cur_seq(j));
- buf->data->u64s = 0;
-}
-
-static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
-{
- return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-}
-
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- lockdep_assert_held(&j->lock);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return JOURNAL_ENTRY_ERROR;
-
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
-
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
-
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
- new.prev_buf_unwritten = 1;
-
- BUG_ON(journal_state_count(new, new.idx));
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- buf = &j->buf[old.idx];
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
-
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- journal_entry_u64s_reserve(buf)) *
- c->opts.block_size;
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
-
- journal_reclaim_fast(j);
- /* XXX: why set this here, and not in journal_write()? */
- buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
-
- journal_pin_new_entry(j, 1);
-
- bch2_journal_buf_init(j);
-
- cancel_delayed_work(&j->write_work);
- spin_unlock(&j->lock);
-
- if (c->bucket_journal_seq > 1 << 14) {
- c->bucket_journal_seq = 0;
- bch2_bucket_seq_cleanup(c);
- }
-
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- bch2_journal_buf_put(j, old.idx, need_write_just_set);
-
- return JOURNAL_UNLOCKED;
-}
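
The cmpxchg loop above is the usual lockless state-transition pattern: read
the packed state word, compute the successor state, retry until no other CPU
raced with us. A stripped-down C11 version of the same idea - the field
layout below is invented for the example, not the real journal_res_state:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* invented layout: low 32 bits = cur_entry_offset, bit 32 = prev_buf_unwritten */
#define CLOSED_VAL	0xffffffffu

static _Atomic uint64_t state;

static int64_t entry_close(void)
{
	uint64_t old = atomic_load(&state), new;

	do {
		if ((uint32_t) old == CLOSED_VAL)
			return -1;	/* lost the race: already closed */

		/* successor state: offset = CLOSED_VAL, prev buf in flight */
		new = (old & ~(uint64_t) CLOSED_VAL) | (1ULL << 32) | CLOSED_VAL;
	} while (!atomic_compare_exchange_weak(&state, &old, new));

	return (uint32_t) old;	/* offset the entry was closed at */
}

int main(void)
{
	atomic_store(&state, 100);	/* entry open, 100 u64s consumed */
	printf("closed at offset %lld\n", (long long) entry_close());
	printf("second close: %lld\n", (long long) entry_close());
	return 0;
}

atomic_compare_exchange_weak() updates old on failure, so each retry
recomputes the successor from the state that beat us - exactly what the
atomic64_cmpxchg() loop does in the kernel code.
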
-
-void bch2_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
-}
-
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Hack to avoid a deadlock during journal replay:
- * journal replay might require setting a new btree
- * root, which requires writing another journal entry -
- * thus, if the journal is full (and this happens when
- * replaying the first journal bucket's entries) we're
- * screwed.
- *
- * So don't let the journal fill up unless we're in
- * replay:
- */
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- available = max((int) available - 2, 0);
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
- available = max((int) available - 1, 0);
-
- return available;
-}
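
The modular arithmetic above is just the distance around the ring from the
bucket after the write head back to the oldest bucket that's still pinned.
A worked example (names illustrative):

#include <stdio.h>

static unsigned buckets_available(unsigned cur_idx, unsigned last_idx,
				  unsigned nr)
{
	unsigned next = (cur_idx + 1) % nr;

	return (last_idx + nr - next) % nr;
}

int main(void)
{
	/* 8 buckets, writing into 5, oldest pinned is 2: 4 free */
	printf("%u\n", buckets_available(5, 2, 8));
	return 0;
}
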
-
-/* returns number of sectors available for next journal entry: */
-static int journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_required = 0;
-
- if (!ja->nr)
- continue;
-
- sectors_available = min_t(unsigned, sectors_available,
- ca->mi.bucket_size);
-
- /*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, if we haven't started the write
- * for the previous entry we have to make sure we have space for
- * it too:
- */
- if (bch2_extent_has_device(e.c, ca->dev_idx)) {
- if (j->prev_buf_sectors > ja->sectors_free)
- buckets_required++;
-
- if (j->prev_buf_sectors + sectors_available >
- ja->sectors_free)
- buckets_required++;
- } else {
- if (j->prev_buf_sectors + sectors_available >
- ca->mi.bucket_size)
- buckets_required++;
-
- buckets_required++;
- }
-
- if (journal_dev_buckets_available(j, ca) >= buckets_required)
- nr_devs++;
- nr_online++;
- }
- rcu_read_unlock();
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
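
The bucket accounting above is easier to see with concrete numbers: since
space isn't allocated until an entry is actually written, a still-unwritten
previous buffer may need a fresh bucket of its own, and the entry we're
sizing may need another. A hypothetical sketch:

#include <stdio.h>

int main(void)
{
	unsigned sectors_free = 100;	/* left in the current bucket */
	unsigned prev_buf = 120;	/* unwritten previous entry */
	unsigned want = 256;		/* sectors for the next entry */
	unsigned need = 0;

	if (prev_buf > sectors_free)
		need++;			/* prev entry starts a new bucket */
	if (prev_buf + want > sectors_free)
		need++;			/* and the next entry yet another */

	printf("buckets required: %u\n", need);	/* 2 */
	return 0;
}
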
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - an open journal entry means the journal is dirty:
- *
- * returns:
- * 1: success
- * 0: journal currently full (must wait)
- * -EROFS: insufficient rw devices
- * -EIO: journal error
- */
-static int journal_entry_open(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
- union journal_res_state old, new;
- ssize_t u64s;
- int sectors;
- u64 v;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
-
- if (!fifo_free(&j->pin))
- return 0;
-
- sectors = journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
-
- buf->disk_sectors = sectors;
-
- sectors = min_t(unsigned, sectors, buf->size >> 9);
- j->cur_buf_sectors = sectors;
-
- u64s = (sectors << 9) / sizeof(u64);
-
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
- /*
- * Btree roots, prio pointers don't get added until right before we do
- * the write:
- */
- u64s -= journal_entry_u64s_reserve(buf);
- u64s = max_t(ssize_t, 0L, u64s);
-
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
-
- if (u64s <= le32_to_cpu(buf->data->u64s))
- return 0;
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- v = atomic64_read(&j->reservations.counter);
- do {
- old.v = new.v = v;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EIO;
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- if (j->res_get_blocked_start)
- __bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
- mod_delayed_work(system_freezable_wq,
- &j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
- journal_wake(j);
- return 1;
-}
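
The capacity computation above with concrete (assumed) numbers: sectors are
512 bytes so sectors << 9 gives bytes, each u64 is 8 bytes, and the jset
header plus the btree-root reserve come off the top. The header and reserve
sizes below are made up for illustration:

#include <stdio.h>

int main(void)
{
	unsigned sectors = 8;			/* a 4KiB journal write */
	long u64s = (sectors << 9) / 8;		/* 4096 bytes -> 512 u64s */

	u64s -= 32 / 8;		/* assumed jset header: 32 bytes */
	u64s -= 100;		/* assumed btree-root/prio reserve */
	if (u64s < 0)
		u64s = 0;

	printf("room for %ld u64s of keys\n", u64s);	/* 408 */
	return 0;
}
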
-
-void bch2_journal_start(struct bch_fs *c)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl;
- u64 new_seq = 0;
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- new_seq = max(new_seq, bl->seq);
-
- spin_lock(&j->lock);
-
- set_bit(JOURNAL_STARTED, &j->flags);
-
- while (journal_cur_seq(j) < new_seq)
- journal_pin_new_entry(j, 0);
-
- /*
- * journal_buf_switch() only inits the next journal entry when it
- * closes an open journal entry - the very first journal entry gets
- * initialized here:
- */
- journal_pin_new_entry(j, 1);
- bch2_journal_buf_init(j);
-
- spin_unlock(&j->lock);
-
- /*
- * Adding entries to the next journal entry before allocating space on
- * disk for the next journal entry - this is ok, because these entries
- * only have to go down with the next journal entry we write:
- */
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (!bl->written) {
- bch2_journal_add_entry_noreservation(journal_cur_buf(j),
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0, &bl->seq, 1);
-
- journal_pin_add_entry(j,
- &fifo_peek_back(&j->pin),
- &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-}
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->replay_pin_list =
- journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- for_each_jset_key(k, _n, entry, &i->j) {
- if (entry->btree_id == BTREE_ID_ALLOC) {
- /*
- * allocation code handles replay for
- * BTREE_ID_ALLOC keys:
- */
- ret = bch2_alloc_replay_key(c, k->k.p);
- } else {
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
-
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- }
-
- if (atomic_dec_and_test(&j->replay_pin_list->count))
- journal_wake(j);
- }
-
- j->replay_pin_list = NULL;
-
- bch2_journal_set_replay_done(j);
- ret = bch2_journal_flush_all_pins(j);
-err:
- bch2_journal_entries_free(list);
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it,
- * the code works:
- */
-static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- struct journal *j = &c->journal;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- struct disk_reservation disk_res = { 0, 0 };
- struct closure cl;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- */
-
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0))
- return -ENOSPC;
-
- mutex_lock(&c->sb_lock);
-
- ret = -ENOMEM;
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
- goto err;
-
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- goto err;
-
- spin_lock(&j->lock);
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- spin_unlock(&j->lock);
-
- while (ja->nr < nr) {
- struct open_bucket *ob;
- size_t bucket;
- int ob_idx;
-
- ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
- if (ob_idx < 0) {
- if (!closure_wait(&c->freelist_wait, &cl))
- closure_sync(&cl);
- continue;
- }
-
- ob = c->open_buckets + ob_idx;
- bucket = sector_to_bucket(ca, ob->ptr.offset);
-
- spin_lock(&j->lock);
- __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
- __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
- __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
-
- ja->buckets[ja->last_idx] = bucket;
- ja->bucket_seq[ja->last_idx] = 0;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
-
- if (ja->last_idx < ja->nr) {
- if (ja->cur_idx >= ja->last_idx)
- ja->cur_idx++;
- ja->last_idx++;
- }
- ja->nr++;
- spin_unlock(&j->lock);
-
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), 0);
-
- bch2_open_bucket_put(c, ob);
- }
-
- bch2_write_super(c);
-
- ret = 0;
-err:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- bch2_disk_reservation_put(c, &disk_res);
-
- if (!ret)
- bch2_dev_allocator_add(c, ca);
-
- closure_sync(&cl);
-
- return ret;
-}
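
The three __array_insert_item() calls above open a slot at last_idx so ring
order is preserved; in plain C, one such insert is just a memmove plus a
store:

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned buckets[8] = { 10, 11, 12, 13 };
	unsigned nr = 4, pos = 1, i;	/* insert at last_idx == 1 */

	memmove(&buckets[pos + 1], &buckets[pos],
		(nr - pos) * sizeof(buckets[0]));
	buckets[pos] = 99;
	nr++;

	for (i = 0; i < nr; i++)
		printf("%u ", buckets[i]);	/* 10 99 11 12 13 */
	printf("\n");
	return 0;
}
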
-
-int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
-{
- unsigned nr;
-
- if (dynamic_fault("bcachefs:add:journal_alloc"))
- return -ENOMEM;
-
- /*
-	 * clamp journal size to 1024 buckets or 512MB ((1 << 20) sectors),
-	 * whichever is smaller:
- */
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
-
- return bch2_set_nr_journal_buckets(c, ca, nr);
-}
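
Concretely, the sizing rule above is nbuckets/256 clamped between
BCH_JOURNAL_BUCKETS_MIN and min(1024 buckets, 512MB worth of buckets). A
worked example, assuming BCH_JOURNAL_BUCKETS_MIN is 8:

#include <stdio.h>

#define BUCKETS_MIN 8	/* assumed value of BCH_JOURNAL_BUCKETS_MIN */

static unsigned clamp_u(unsigned v, unsigned lo, unsigned hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	unsigned long long nbuckets = 1ULL << 20;	/* 1M buckets on disk */
	unsigned bucket_size = 1024;		/* sectors, i.e. 512KiB */
	unsigned cap = (1 << 20) / bucket_size;	/* 512MB in buckets: 1024 */
	unsigned hi = cap < (1 << 10) ? cap : (1 << 10);

	printf("journal gets %u buckets\n",
	       clamp_u(nbuckets >> 8, BUCKETS_MIN, hi));	/* 1024 */
	return 0;
}
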
-
-/* Journalling */
-
-/**
- * journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Pops journal entries
- * whose pin counts have reached zero - i.e. all the btree nodes they cover
- * have been written out - off the front of the pin FIFO, waking up waiters.
- */
-static void journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- journal_wake(j);
-}
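
The fast path is literally just popping fully-unpinned entries off the front
of the FIFO. Note that a zero-count entry stuck behind a still-pinned one
can't be reclaimed, because last_seq has to advance in order. A toy model:

#include <stdio.h>

int main(void)
{
	/* pin counts in FIFO order, oldest first: */
	int counts[] = { 0, 0, 2, 0, 1 };
	unsigned front = 0, nr = 5;

	while (front < nr && counts[front] == 0)
		front++;

	/* reclaims entries 0 and 1; entry 3 stays stuck behind entry 2 */
	printf("reclaimed %u entries, front now %u\n", front, front);
	return 0;
}
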
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, marking it as dirty:
- */
-
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- BUG_ON(journal_pin_active(pin));
- BUG_ON(!atomic_read(&pin_list->count));
-
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
-
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
-
- /*
- * If the journal is currently full, we might want to call flush_fn
- * immediately:
- */
- journal_wake(j);
-}
-
-static void journal_pin_add_entry(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-void bch2_journal_pin_add(struct journal *j,
- struct journal_res *res,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- struct journal_entry_pin_list *pin_list = res->ref
- ? journal_seq_pin(j, res->seq)
- : j->replay_pin_list;
-
- spin_lock(&j->lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock(&j->lock);
-}
-
-static inline void __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
-
- if (!journal_pin_active(pin))
- return;
-
- pin->pin_list = NULL;
- list_del_init(&pin->list);
-
- /*
-	 * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- */
- if (atomic_dec_and_test(&pin_list->count) &&
- pin_list == &fifo_peek_front(&j->pin))
- journal_reclaim_fast(j);
-}
-
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- spin_lock(&j->lock);
- __journal_pin_drop(j, pin);
- spin_unlock(&j->lock);
-}
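
A stripped-down model of the pin lifecycle the helpers above implement (all
names invented): each journal entry carries a refcount, every dirty object
takes a ref on the entry it was journalled in, and the final drop is what
lets reclaim advance past that entry:

#include <stdio.h>

struct pin_list { int count; };		/* one per journal entry */
struct pin { struct pin_list *pl; };	/* embedded in e.g. a btree node */

static void pin_add(struct pin *p, struct pin_list *pl)
{
	pl->count++;
	p->pl = pl;
}

static void pin_drop(struct pin *p)
{
	if (!p->pl)
		return;
	if (--p->pl->count == 0)
		printf("entry fully unpinned, reclaim can advance\n");
	p->pl = NULL;
}

int main(void)
{
	struct pin_list entry = { .count = 1 };	/* ref held by the journal */
	struct pin a = { 0 };

	pin_add(&a, &entry);	/* dirty btree node pins the entry */
	entry.count--;		/* journal write completes, drops its ref */
	pin_drop(&a);		/* btree node written: last ref gone */
	return 0;
}
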
-
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock(&j->lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- journal_pin_seq(j, src_pin->pin_list) <
- journal_pin_seq(j, pin->pin_list))) {
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
- }
-
- spin_unlock(&j->lock);
-}
-
-static struct journal_entry_pin *
-__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret;
- u64 iter;
-
- /* no need to iterate over empty fifo entries: */
- journal_reclaim_fast(j);
-
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (iter > seq_to_flush)
- break;
-
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch2_journal_pin_drop() */
- list_move(&ret->list, &pin_list->flushed);
- *seq = iter;
- return ret;
- }
- }
-
- return NULL;
-}
-
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin *ret;
-
- spin_lock(&j->lock);
- ret = __journal_get_next_pin(j, seq_to_flush, seq);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-static int journal_flush_done(struct journal *j, u64 seq_to_flush,
- struct journal_entry_pin **pin,
- u64 *pin_seq)
-{
- int ret;
-
- *pin = NULL;
-
- ret = bch2_journal_error(j);
- if (ret)
- return ret;
-
- spin_lock(&j->lock);
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers
- */
- ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
- !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
- journal_last_seq(j) > seq_to_flush ||
- (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_entry_pin *pin;
- u64 pin_seq;
- bool flush;
-
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return 0;
-again:
- wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
- if (pin) {
- /* flushing a journal pin might cause a new one to be added: */
- pin->flush(j, pin, pin_seq);
- goto again;
- }
-
- spin_lock(&j->lock);
- flush = journal_last_seq(j) != j->last_seq_ondisk ||
- (seq_to_flush == U64_MAX && c->btree_roots_dirty);
- spin_unlock(&j->lock);
-
- return flush ? bch2_journal_meta(j) : 0;
-}
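
The again: loop above is a flush-until-quiescent pattern: flushing one pin
can create new, later pins (a btree node write dirtying its parent, say), so
the scan restarts from the top until nothing below the target remains. In
miniature:

#include <stdbool.h>
#include <stdio.h>

static int pending = 3;

static bool get_next_pin(int *pin)
{
	if (!pending)
		return false;
	*pin = pending--;
	return true;
}

int main(void)
{
	int pin;

again:
	if (get_next_pin(&pin)) {
		printf("flushing pin %d\n", pin);
		goto again;	/* flushing may have re-added work */
	}
	printf("all pins flushed\n");
	return 0;
}
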
-
-int bch2_journal_flush_all_pins(struct journal *j)
-{
- return bch2_journal_flush_pins(j, U64_MAX);
-}
-
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
-{
- bool ret;
-
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/**
- * journal_reclaim_work - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-static void journal_reclaim_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(to_delayed_work(work),
- struct bch_fs, journal.reclaim_work);
- struct journal *j = &c->journal;
- struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
-
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
- for_each_rw_member(ca, c, iter) {
- struct journal_device *ja = &ca->journal;
-
- if (!ja->nr)
- continue;
-
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
-
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- journal_wake(j);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
- bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
- seq_to_flush = max_t(u64, seq_to_flush,
- ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
- }
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
-
- /* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
- __set_current_state(TASK_RUNNING);
- pin->flush(j, pin, seq);
- need_flush = false;
-
- j->last_flushed = jiffies;
- }