- void *n;
-
- /* the bios are sized for this many pages, max: */
- if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -ENOMEM;
-
- new_size = roundup_pow_of_two(new_size);
- n = kvpmalloc(new_size, GFP_KERNEL);
- if (!n)
- return -ENOMEM;
-
- kvpfree(b->data, b->size);
- b->data = n;
- b->size = new_size;
- return 0;
-}
-
-/*
- * Read journal bucket @bucket of device @ca through @buf and pass every
- * entry that validates to journal_entry_add() on @jlist.
- *
- * *@seq is raised to the highest sequence number of any entry that reaches
- * journal_entry_add(); *@entries_found is set when that call reports
- * JOURNAL_ENTRY_ADD_OK. @buf is grown with journal_read_buf_realloc() when
- * an entry is larger than the current buffer.
- *
- * Returns 0 when the bucket has been consumed (or scanning stops early),
- * -EIO on a read failure, or the nonzero code of whichever helper failed.
- */
-static int journal_read_bucket(struct bch_dev *ca,
- struct journal_read_buf *buf,
- struct journal_list *jlist,
- unsigned bucket, u64 *seq, bool *entries_found)
-{
- struct bch_fs *c = ca->fs;
- struct journal_device *ja = &ca->journal;
- struct bio *bio = ja->bio;
- struct jset *j = NULL;
- unsigned sectors, sectors_read = 0;
- u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
- end = offset + ca->mi.bucket_size;
- bool saw_bad = false;
- int ret = 0;
-
- pr_debug("reading %u", bucket);
-
- while (offset < end) {
- /* nothing left in the buffer: read the next chunk from disk */
- if (!sectors_read) {
-reread: sectors_read = min_t(unsigned,
- end - offset, buf->size >> 9);
-
- bio_reset(bio);
- bio->bi_bdev = ca->disk_sb.bdev;
- bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch2_bio_map(bio, buf->data);
-
- ret = submit_bio_wait(bio);
-
- if (bch2_dev_io_err_on(ret, ca,
- "journal read from sector %llu",
- offset) ||
- bch2_meta_read_fault("journal"))
- return -EIO;
-
- j = buf->data;
- }
-
- ret = journal_entry_validate(c, j, offset,
- end - offset, sectors_read,
- READ);
- switch (ret) {
- case BCH_FSCK_OK:
- break;
- case JOURNAL_ENTRY_REREAD:
- /* grow the buffer first if the entry does not fit in it */
- if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
- vstruct_bytes(j));
- if (ret)
- return ret;
- }
- goto reread;
- case JOURNAL_ENTRY_NONE:
- /*
- * Stop at the first empty slot, unless a bad entry was
- * seen earlier - then keep scanning one block at a time:
- */
- if (!saw_bad)
- return 0;
- sectors = c->opts.block_size;
- goto next_block;
- case JOURNAL_ENTRY_BAD:
- saw_bad = true;
- sectors = c->opts.block_size;
- goto next_block;
- default:
- return ret;
- }
-
- /*
- * This happens sometimes if we don't have discards on -
- * when we've partially overwritten a bucket with new
- * journal entries. We don't need the rest of the
- * bucket:
- */
- if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
-
- ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
-
- mutex_lock(&jlist->lock);
- ret = journal_entry_add(c, ca, jlist, j);
- mutex_unlock(&jlist->lock);
-
- switch (ret) {
- case JOURNAL_ENTRY_ADD_OK:
- *entries_found = true;
- break;
- case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
- break;
- default:
- return ret;
- }
-
- if (le64_to_cpu(j->seq) > *seq)
- *seq = le64_to_cpu(j->seq);
-
- sectors = vstruct_sectors(j, c->block_bits);
-next_block:
- /* advance the disk offset and the in-buffer cursor together */
- pr_debug("next");
- offset += sectors;
- sectors_read -= sectors;
- j = ((void *) j) + (sectors << 9);
- }
-
- return 0;
-}
-
-/*
- * Closure function: read the journal buckets of the device that embeds @cl
- * (a journal_device's ->read closure) into the journal_list that embeds
- * cl->parent, then work out ja->cur_idx and ja->last_idx from the bucket
- * sequence numbers.
- *
- * On failure the error code is stored in jlist->ret under jlist->lock. In
- * all cases the read buffer is freed, ca->io_ref is dropped and the closure
- * returns.
- */
-static void bch2_journal_read_device(struct closure *cl)
-{
- /*
- * read_bucket(b): read bucket b, jump to err on failure, mark b in
- * bitmap as read, and evaluate to whether entries were found in it.
- */
-#define read_bucket(b) \
- ({ \
- bool entries_found = false; \
- ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \
- &entries_found); \
- if (ret) \
- goto err; \
- __set_bit(b, bitmap); \
- entries_found; \
- })
-
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
- struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
- struct journal_list *jlist =
- container_of(cl->parent, struct journal_list, cl);
- struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
- struct journal_read_buf buf = { NULL, 0 };
-
- DECLARE_BITMAP(bitmap, ja->nr);
- unsigned i, l, r;
- u64 seq = 0;
- int ret;
-
- if (!ja->nr)
- goto out;
-
- bitmap_zero(bitmap, ja->nr);
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
- if (ret)
- goto err;
-
- pr_debug("%u journal buckets", ja->nr);
-
- /*
- * If the device supports discard but not secure discard, we can't do
- * the fancy fibonacci hash/binary search because the live journal
- * entries might not form a contiguous range:
- */
- for (i = 0; i < ja->nr; i++)
- read_bucket(i);
- /*
- * NOTE: this goto is unconditional, so everything from here up to the
- * search_done label (hash probe, linear_scan, bsearch) is never
- * reached as the code stands.
- */
- goto search_done;
-
- if (!blk_queue_nonrot(q))
- goto linear_scan;
-
- /*
- * Read journal buckets ordered by golden ratio hash to quickly
- * find a sequence of buckets with valid journal entries
- */
- for (i = 0; i < ja->nr; i++) {
- l = (i * 2654435769U) % ja->nr;
-
- if (test_bit(l, bitmap))
- break;
-
- if (read_bucket(l))
- goto bsearch;
- }
-
- /*
- * If that fails, check all the buckets we haven't checked
- * already
- */
- pr_debug("falling back to linear search");
-linear_scan:
- for (l = find_first_zero_bit(bitmap, ja->nr);
- l < ja->nr;
- l = find_next_zero_bit(bitmap, ja->nr, l + 1))
- if (read_bucket(l))
- goto bsearch;
-
- /* no journal entries on this device? */
- if (l == ja->nr)
- goto out;
-bsearch:
- /* Binary search */
- r = find_next_bit(bitmap, ja->nr, l + 1);
- pr_debug("starting binary search, l %u r %u", l, r);
-
- while (l + 1 < r) {
- unsigned m = (l + r) >> 1;
- u64 cur_seq = seq;
-
- read_bucket(m);
-
- if (cur_seq != seq)
- l = m;
- else
- r = m;
- }
-
-search_done:
- /*
- * Find the journal bucket with the highest sequence number:
- *
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- seq = 0;
-
- for (i = 0; i < ja->nr; i++)
- if (ja->bucket_seq[i] >= seq &&
- ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
- /*
- * When journal_next_bucket() goes to allocate for
- * the first time, it'll use the bucket after
- * ja->cur_idx
- */
- ja->cur_idx = i;
- seq = ja->bucket_seq[i];
- }
-
- /*
- * Set last_idx to indicate the entire journal is full and needs to be
- * reclaimed - journal reclaim will immediately reclaim whatever isn't
- * pinned when it first runs:
- */
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
- /*
- * Read buckets in reverse order until we stop finding more journal
- * entries:
- */
- for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
- i != ja->cur_idx;
- i = (i + ja->nr - 1) % ja->nr)
- if (!test_bit(i, bitmap) &&
- !read_bucket(i))
- break;
-out:
- kvpfree(buf.data, buf.size);
- percpu_ref_put(&ca->io_ref);
- closure_return(cl);
-err:
- /* publish the failure to the caller, then share the cleanup path */
- mutex_lock(&jlist->lock);
- jlist->ret = ret;
- mutex_unlock(&jlist->lock);
- goto out;
-#undef read_bucket
-}
-
-/*
- * Unlink and free every struct journal_replay on @list, leaving the list
- * empty. Each element is freed with the size of its header plus its jset.
- */
-void bch2_journal_entries_free(struct list_head *list)
-{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
-/*
- * For every JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED entry in @i's jset, create
- * a blacklist record for the sequence number it names, pin that record to
- * pin list @p (flushed via journal_seq_blacklist_flush) and mark it as
- * already written.
- *
- * Returns 0 on success, -ENOMEM if a blacklist record cannot be allocated.
- */
-static int journal_seq_blacklist_read(struct journal *j,
- struct journal_replay *i,
- struct journal_entry_pin_list *p)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *entry;
- struct journal_seq_blacklist *bl;
- u64 seq;
-
- for_each_jset_entry_type(entry, &i->j,
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) {
- struct jset_entry_blacklist *bl_entry =
- container_of(entry, struct jset_entry_blacklist, entry);
- seq = le64_to_cpu(bl_entry->seq);
-
- bch_verbose(c, "blacklisting existing journal seq %llu", seq);
-
- bl = bch2_journal_seq_blacklisted_new(j, seq);
- if (!bl)
- return -ENOMEM;
-
- journal_pin_add_entry(j, p, &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- return 0;
-}
-
-/*
- * Returns true as soon as any journal_replay on @list yields a key from
- * for_each_jset_key(), false if none of them contain a key.
- */
-static inline bool journal_has_keys(struct list_head *list)
-{
- struct journal_replay *i;
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
-
- list_for_each_entry(i, list, list)
- for_each_jset_key(k, _n, entry, &i->j)
- return true;
-
- return false;
-}
-
-/*
- * Read the journal from every readable member device of @c into @list and
- * set up the in-memory journal state from what was found.
- *
- * Steps, in order:
- *  - run bch2_journal_read_device() for each readable device and wait;
- *  - validate each entry and make sure the superblock records the devices
- *    holding it as journal replicas;
- *  - set j->seq, j->last_seq_ondisk and the pin fifo bounds from the last
- *    entry on @list, and reset every pin list in the fifo;
- *  - give each entry's pin list a count of 1 and load its blacklist entries;
- *  - report blacklisted or missing sequence numbers and count keys/entries.
- *
- * Returns 0 on success, the error recorded by a device read,
- * BCH_FSCK_REPAIR_IMPOSSIBLE if no entries were found, -ENOMEM if a
- * blacklist record cannot be allocated, or the code of a failing
- * validation/fsck step.
- */
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct journal_list jlist;
- struct journal_replay *i;
- struct journal_entry_pin_list *p;
- struct bch_dev *ca;
- u64 cur_seq, end_seq;
- unsigned iter, keys = 0, entries = 0;
- int ret = 0;
-
- closure_init_stack(&jlist.cl);
- mutex_init(&jlist.lock);
- jlist.head = list;
- jlist.ret = 0;
-
- /* the io_ref taken here is dropped by bch2_journal_read_device() */
- for_each_readable_member(ca, c, iter) {
- percpu_ref_get(&ca->io_ref);
- closure_call(&ca->journal.read,
- bch2_journal_read_device,
- system_unbound_wq,
- &jlist.cl);
- }
-
- closure_sync(&jlist.cl);
-
- if (jlist.ret)
- return jlist.ret;
-
- if (list_empty(list)){
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
- fsck_err_on(c->sb.clean && journal_has_keys(list), c,
- "filesystem marked clean but journal has keys to replay");
-
- list_for_each_entry(i, list, list) {
- ret = journal_entry_validate_entries(c, &i->j, READ);
- if (ret)
- goto fsck_err;
-
- if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas_devlist(c, &i->devs,
- BCH_DATA_JOURNAL), c,
- "superblock not marked as containing replicas (type %u)",
- BCH_DATA_JOURNAL)) {
- ret = bch2_check_mark_super_devlist(c, &i->devs,
- BCH_DATA_JOURNAL);
- if (ret)
- return ret;
- }
- }
-
- /* the last entry on the list supplies seq and last_seq below */
- i = list_last_entry(list, struct journal_replay, list);
-
- unfixable_fsck_err_on(le64_to_cpu(i->j.seq) -
- le64_to_cpu(i->j.last_seq) + 1 > j->pin.size, c,
- "too many journal entries open for refcount fifo");
-
- atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
- j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
-
- j->pin.front = le64_to_cpu(i->j.last_seq);
- j->pin.back = le64_to_cpu(i->j.seq) + 1;
-
- BUG_ON(last_seq(j) != le64_to_cpu(i->j.last_seq));
- BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
- &fifo_peek_back(&j->pin));
-
- fifo_for_each_entry_ptr(p, &j->pin, iter) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
-
- mutex_lock(&j->blacklist_lock);
-
- list_for_each_entry(i, list, list) {
- p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
-
- if (journal_seq_blacklist_read(j, i, p)) {
- mutex_unlock(&j->blacklist_lock);
- return -ENOMEM;
- }
- }
-
- mutex_unlock(&j->blacklist_lock);
-
- cur_seq = last_seq(j);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
-
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- bool blacklisted;
-
- mutex_lock(&j->blacklist_lock);
- /* blacklisted sequence numbers do not count as missing */
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- journal_seq_blacklist_find(j, cur_seq))
- cur_seq++;
-
- blacklisted = journal_seq_blacklist_find(j,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&j->blacklist_lock);
-
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
-
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- last_seq(j), end_seq);
-
- cur_seq = le64_to_cpu(i->j.seq) + 1;
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
- }
-
- bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
- keys, entries, (u64) atomic64_read(&j->seq));
-fsck_err:
- return ret;
-}
-
-/*
- * Walk every key of every journal entry on @list and, for keys whose btree
- * type has pointers, call bch2_btree_mark_key_initial() on them.
- *
- * Returns 0 on success or the first nonzero code from the marking call.
- */
-int bch2_journal_mark(struct bch_fs *c, struct list_head *list)
-{
- struct bkey_i *k, *n;
- struct jset_entry *j;
- struct journal_replay *r;
- int ret;
-
- list_for_each_entry(r, list, list)
- for_each_jset_key(k, n, j, &r->j) {
- enum bkey_type type = bkey_type(j->level, j->btree_id);
- struct bkey_s_c k_s_c = bkey_i_to_s_c(k);
-
- if (btree_type_has_ptrs(type)) {
- ret = bch2_btree_mark_key_initial(c, type, k_s_c);
- if (ret)
- return ret;
- }
- }
-
- return 0;
-}
-
-/*
- * True when the reservation state's cur_entry_offset is below
- * JOURNAL_ENTRY_CLOSED_VAL.
- */
-static bool journal_entry_is_open(struct journal *j)
-{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-/*
- * Drop one count from the pin list of the previous journal buffer's
- * sequence number, update the delay time stats when JOURNAL_NEED_WRITE is
- * set and was not set just now (@need_write_just_set false), then start the
- * write by calling journal_write() directly on j->io.
- */
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
-{
- struct journal_buf *w = journal_prev_buf(j);
-
- atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- __bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-#if 0
- closure_call(&j->io, journal_write, NULL, NULL);
-#else
- /* Shut sparse up: */
- closure_init(&j->io, NULL);
- set_closure_fn(&j->io, journal_write, NULL);
- journal_write(&j->io);
-#endif
-}
-
-/*
- * Push a new pin list onto the j->pin fifo, advance j->seq by one and
- * initialise the new pin list with reference count @count, empty lists and
- * no devices.
- */
-static void __journal_entry_new(struct journal *j, int count)
-{
- struct journal_entry_pin_list *p = fifo_push_ref(&j->pin);
-
- /*
- * The fifo_push() needs to happen at the same time as j->seq is
- * incremented for last_seq() to be calculated correctly
- */
- atomic64_inc(&j->seq);
-
- BUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
- &fifo_peek_back(&j->pin));
-
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, count);
- p->devs.nr = 0;
-}
-
-/*
- * Begin a new journal entry: allocate its sequence number with a pin count
- * of 1, then clear the current buffer's has_inode bitmap and jset header
- * and stamp the header with the new sequence number.
- */
-static void __bch2_journal_next_entry(struct journal *j)
-{
- struct journal_buf *buf;
-
- __journal_entry_new(j, 1);
-
- buf = journal_cur_buf(j);
- memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
- memset(buf->data, 0, sizeof(*buf->data));
- buf->data->seq = cpu_to_le64(atomic64_read(&j->seq));
- buf->data->u64s = 0;
-}
-
-/*
- * Number of u64s held back in every journal entry: one
- * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX) slot per btree id.
- * @buf is not used by the calculation.
- */
-static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
-{
- return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
-}
-
-/*
- * Close the currently open journal entry and switch to the other buffer.
- *
- * Must be called with j->lock held. Returns JOURNAL_ENTRY_CLOSED,
- * JOURNAL_ENTRY_ERROR or JOURNAL_ENTRY_INUSE (previous buffer still
- * unwritten) with the lock still held and nothing changed. Otherwise it
- * finalises the old buffer's header, starts the next entry, drops j->lock
- * and returns JOURNAL_UNLOCKED after putting its reference on the old
- * buffer.
- */
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct journal_buf *buf;
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- lockdep_assert_held(&j->lock);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
-
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return JOURNAL_ENTRY_ERROR;
-
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
-
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
-
- new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
- new.idx++;
- new.prev_buf_unwritten = 1;
-
- BUG_ON(journal_state_count(new, new.idx));
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- journal_reclaim_fast(j);
-
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
- /* record the final size and last_seq in the entry just closed */
- buf = &j->buf[old.idx];
- buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- buf->data->last_seq = cpu_to_le64(last_seq(j));
-
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- journal_entry_u64s_reserve(buf)) *
- c->opts.block_size;
-
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
-
- __bch2_journal_next_entry(j);
-
- cancel_delayed_work(&j->write_work);
- spin_unlock(&j->lock);
-
- if (c->bucket_journal_seq > 1 << 14) {
- c->bucket_journal_seq = 0;
- bch2_bucket_seq_cleanup(c);
- }
-
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- /* drops the reference taken by journal_state_inc() above */
- bch2_journal_buf_put(j, old.idx, need_write_just_set);
-
- return JOURNAL_UNLOCKED;
-}
-
-/*
- * Put the journal into the error state by atomically setting
- * cur_entry_offset to JOURNAL_ENTRY_ERROR_VAL, then wake everything waiting
- * on j->wait and on both journal buffers. Does nothing if the journal is
- * already in the error state.
- */
-void bch2_journal_halt(struct journal *j)
-{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
-
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
-
- wake_up(&j->wait);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
-}
-
-/*
- * Number of journal buckets on @ca that may be used for new entries: the
- * circular distance from the bucket after ja->cur_idx to ja->last_idx,
- * reduced by 2 once JOURNAL_REPLAY_DONE is set and by 1 more when the bucket
- * at last_idx still holds a sequence number >= last_seq(j). Never negative.
- */
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct bch_dev *ca)
-{
- struct journal_device *ja = &ca->journal;
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Hack to avoid a deadlock during journal replay:
- * journal replay might require setting a new btree
- * root, which requires writing another journal entry -
- * thus, if the journal is full (and this happens when
- * replaying the first journal bucket's entries) we're
- * screwed.
- *
- * So don't let the journal fill up unless we're in
- * replay:
- */
- if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- available = max((int) available - 2, 0);
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (ja->bucket_seq[ja->last_idx] >= last_seq(j))
- available = max((int) available - 1, 0);
-
- return available;
-}
-
-/*
- * Returns the number of sectors available for the next journal entry:
- *
- *  -EROFS  fewer journal devices are online than
- *          c->opts.metadata_replicas_required;
- *  0       too few devices currently have enough free buckets;
- *  > 0     the smallest bucket size among the journal devices.
- *
- * Must be called with j->lock held.
- */
-static int journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_required = 0;
-
- /* devices without journal buckets are not counted at all */
- if (!ja->nr)
- continue;
-
- sectors_available = min_t(unsigned, sectors_available,
- ca->mi.bucket_size);
-
- /*
- * Note that we don't allocate the space for a journal entry
- * until we write it out - thus, if we haven't started the write
- * for the previous entry we have to make sure we have space for
- * it too:
- */
- if (bch2_extent_has_device(e.c, ca->dev_idx)) {
- if (j->prev_buf_sectors > ja->sectors_free)
- buckets_required++;
-
- if (j->prev_buf_sectors + sectors_available >
- ja->sectors_free)
- buckets_required++;
- } else {
- if (j->prev_buf_sectors + sectors_available >
- ca->mi.bucket_size)
- buckets_required++;
-
- buckets_required++;
- }
-
- if (journal_dev_buckets_available(j, ca) >= buckets_required)
- nr_devs++;
- nr_online++;
- }
- rcu_read_unlock();
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
-
-/*
- * should _only_ be called from journal_res_get() - when we actually want a
- * journal reservation - journal entry is open means journal is dirty:
- *
- * Must be called with j->lock held and with no entry currently open.
- * Returns 1 if the entry was opened, 0 if it was not (pin fifo full, no
- * space, no room beyond what the buffer already holds, or the journal is
- * in the error state), or the negative code from journal_entry_sectors().
- */
-static int journal_entry_open(struct journal *j)
-{
- struct journal_buf *buf = journal_cur_buf(j);
- ssize_t u64s;
- int ret = 0, sectors;
-
- lockdep_assert_held(&j->lock);
- BUG_ON(journal_entry_is_open(j));
-
- if (!fifo_free(&j->pin))
- return 0;
-
- sectors = journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
-
- buf->disk_sectors = sectors;
-
- /* the entry can be no larger than the in-memory buffer */
- sectors = min_t(unsigned, sectors, buf->size >> 9);
- j->cur_buf_sectors = sectors;
-
- u64s = (sectors << 9) / sizeof(u64);
-
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
- /*
- * Btree roots, prio pointers don't get added until right before we do
- * the write:
- */
- u64s -= journal_entry_u64s_reserve(buf);
- u64s = max_t(ssize_t, 0L, u64s);
-
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
-
- if (u64s > le32_to_cpu(buf->data->u64s)) {
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- /*
- * Must be set before marking the journal entry as open:
- */
- j->cur_entry_u64s = u64s;
-
- do {
- old.v = new.v = v;
-
- /* note: 'false' is returned as the int value 0 here */
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return false;
-
- /* Handle any already added entries */
- new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
- ret = 1;
-
- wake_up(&j->wait);
-
- if (j->res_get_blocked_start) {
- __bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
- }
-
- mod_delayed_work(system_freezable_wq,
- &j->write_work,
- msecs_to_jiffies(j->write_delay_ms));
- }
-
- return ret;
-}
-
-/*
- * Start the journal of @c: set JOURNAL_STARTED, advance j->seq past the
- * highest blacklisted sequence number, initialise the first journal entry,
- * add a blacklist entry to it for every blacklist record not yet written,
- * and queue the reclaim work.
- */
-void bch2_journal_start(struct bch_fs *c)
-{
- struct journal *j = &c->journal;
- struct journal_seq_blacklist *bl;
- u64 new_seq = 0;
-
- list_for_each_entry(bl, &j->seq_blacklist, list)
- new_seq = max(new_seq, bl->seq);
-
- spin_lock(&j->lock);
-
- set_bit(JOURNAL_STARTED, &j->flags);
-
- /* entries created only to skip sequence numbers get a pin count of 0 */
- while (atomic64_read(&j->seq) < new_seq)
- __journal_entry_new(j, 0);
-
- /*
- * journal_buf_switch() only inits the next journal entry when it
- * closes an open journal entry - the very first journal entry gets
- * initialized here:
- */
- __bch2_journal_next_entry(j);
-
- /*
- * Adding entries to the next journal entry before allocating space on
- * disk for the next journal entry - this is ok, because these entries
- * only have to go down with the next journal entry we write:
- */
- list_for_each_entry(bl, &j->seq_blacklist, list)
- if (!bl->written) {
- bch2_journal_add_entry_noreservation(journal_cur_buf(j),
- JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED,
- 0, 0, &bl->seq, 1);
-
- journal_pin_add_entry(j,
- &fifo_peek_back(&j->pin),
- &bl->pin,
- journal_seq_blacklist_flush);
- bl->written = true;
- }
-
- spin_unlock(&j->lock);
-
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
-}
-
-/*
- * Replay every key of every journal entry on @list, in list order.
- *
- * BTREE_ID_ALLOC keys go through bch2_alloc_replay_key(); all other keys
- * are inserted with bch2_btree_insert(). After an entry's keys are done its
- * pin list count is dropped. Once everything is replayed the journal is
- * marked replay-done and, if any key was replayed, pins are flushed and a
- * new journal entry is written with bch2_journal_meta().
- *
- * @list is always freed before returning. Returns 0 on success or the
- * first error encountered.
- */
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0, did_replay = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- /* bch2_journal_pin_add() falls back to this list when res->ref is unset */
- j->replay_pin_list =
- journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- for_each_jset_key(k, _n, entry, &i->j) {
- struct disk_reservation disk_res;
-
- if (entry->btree_id == BTREE_ID_ALLOC) {
- /*
- * allocation code handles replay for
- * BTREE_ID_ALLOC keys:
- */
- ret = bch2_alloc_replay_key(c, k->k.p);
- } else {
-
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0));
-
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
- bch2_disk_reservation_put(c, &disk_res);
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- did_replay = true;
- }
-
- if (atomic_dec_and_test(&j->replay_pin_list->count))
- wake_up(&j->wait);
- }
-
- j->replay_pin_list = NULL;
-
- bch2_journal_set_replay_done(j);
-
- if (did_replay) {
- bch2_journal_flush_pins(&c->journal, U64_MAX);
-
- /*
- * Write a new journal entry _before_ we start journalling new data -
- * otherwise, we could end up with btree node bsets with journal seqs
- * arbitrarily far in the future vs. the most recently written journal
- * entry on disk, if we crash before writing the next journal entry:
- */
- ret = bch2_journal_meta(j);
- if (ret) {
- bch_err(c, "journal replay: error %d flushing journal", ret);
- goto err;
- }
- }
-err:
- /* reached on success as well: the entry list is freed either way */
- bch2_journal_entries_free(list);
- return ret;
-}
-
-/*
- * Allocate more journal space at runtime - not currently making use of it, but
- * the code works:
- *
- * Grows the journal of @ca to @nr buckets. Requests that would not grow it
- * (@nr <= ja->nr) return 0 without doing anything. New buckets are
- * inserted at ja->last_idx in ja->buckets, ja->bucket_seq and the
- * superblock's journal field, and the superblock is written at the end.
- *
- * Returns 0 on success, -ENOSPC if the disk reservation fails, or -ENOMEM
- * if the new arrays or the superblock field cannot be allocated.
- */
-static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
-{
- struct journal *j = &c->journal;
- struct journal_device *ja = &ca->journal;
- struct bch_sb_field_journal *journal_buckets;
- struct disk_reservation disk_res = { 0, 0 };
- struct closure cl;
- u64 *new_bucket_seq = NULL, *new_buckets = NULL;
- int ret = 0;
-
- closure_init_stack(&cl);
-
- /* don't handle reducing nr of buckets yet: */
- if (nr <= ja->nr)
- return 0;
-
- /*
- * note: journal buckets aren't really counted as _sectors_ used yet, so
- * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
- * when space used goes up without a reservation - but we do need the
- * reservation to ensure we'll actually be able to allocate:
- */
-
- if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 0))
- return -ENOSPC;
-
- mutex_lock(&c->sb_lock);
-
- ret = -ENOMEM;
- new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
- if (!new_buckets || !new_bucket_seq)
- goto err;
-
- journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
- if (!journal_buckets)
- goto err;
-
- /*
- * Copy the old contents into the larger arrays and swap them in; after
- * the swap new_buckets/new_bucket_seq hold the old arrays, which are
- * freed at the end of the function.
- */
- spin_lock(&j->lock);
- memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
- memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
- swap(new_buckets, ja->buckets);
- swap(new_bucket_seq, ja->bucket_seq);
- spin_unlock(&j->lock);
-
- while (ja->nr < nr) {
- struct open_bucket *ob;
- size_t bucket;
- int ob_idx;
-
- /* on allocation failure, wait on the freelist and retry */
- ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
- if (ob_idx < 0) {
- if (!closure_wait(&c->freelist_wait, &cl))
- closure_sync(&cl);
- continue;
- }
-
- ob = c->open_buckets + ob_idx;
- bucket = sector_to_bucket(ca, ob->ptr.offset);
-
- spin_lock(&j->lock);
- __array_insert_item(ja->buckets, ja->nr, ja->last_idx);
- __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
- __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
-
- ja->buckets[ja->last_idx] = bucket;
- ja->bucket_seq[ja->last_idx] = 0;
- journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket);
-
- /* keep cur_idx/last_idx pointing at the same buckets as before */
- if (ja->last_idx < ja->nr) {
- if (ja->cur_idx >= ja->last_idx)
- ja->cur_idx++;
- ja->last_idx++;
- }
- ja->nr++;
- spin_unlock(&j->lock);
-
- bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB), 0);
-
- bch2_open_bucket_put(c, ob);
- }
-
- BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi));
-
- bch2_write_super(c);
-
- ret = 0;
-err:
- mutex_unlock(&c->sb_lock);
-
- kfree(new_bucket_seq);
- kfree(new_buckets);
- bch2_disk_reservation_put(c, &disk_res);
-
- if (!ret)
- bch2_dev_allocator_add(c, ca);
-
- closure_sync(&cl);
-
- return ret;
-}
-
-/*
- * Size and allocate the journal for device @ca: 1/256th of the device's
- * buckets, clamped as described below, passed to
- * bch2_set_nr_journal_buckets().
- *
- * Returns that function's result, or -ENOMEM when the
- * "bcachefs:add:journal_alloc" dynamic fault fires.
- */
-int bch2_dev_journal_alloc(struct bch_dev *ca)
-{
- unsigned nr;
-
- if (dynamic_fault("bcachefs:add:journal_alloc"))
- return -ENOMEM;
-
- /*
- * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
- * is smaller:
- */
- nr = clamp_t(unsigned, ca->mi.nbuckets >> 8,
- BCH_JOURNAL_BUCKETS_MIN,
- min(1 << 10,
- (1 << 20) / ca->mi.bucket_size));
-
- return bch2_set_nr_journal_buckets(ca->fs, ca, nr);
-}
-
-/* Journalling */
-
-/**
- * journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- *
- * Must be called with j->lock held. As written here it pops entries off the
- * front of j->pin while their count is zero and wakes j->wait if anything
- * was popped.
- */
-static void journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- wake_up(&j->wait);
-}
-
-/*
- * Journal entry pinning - machinery for holding a reference on a given journal
- * entry, marking it as dirty:
- */
-
-/*
- * Attach @pin to @pin_list: take a reference on the list, record the list
- * and @flush_fn in the pin, and link the pin onto pin_list->list only when a
- * flush function was supplied. @pin must not already be active.
- *
- * The callers in this file invoke this with j->pin_lock held.
- */
-static inline void __journal_pin_add(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- BUG_ON(journal_pin_active(pin));
-
- atomic_inc(&pin_list->count);
- pin->pin_list = pin_list;
- pin->flush = flush_fn;
-
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
-}
-
-/*
- * Locked wrapper: attach @pin to the explicitly given @pin_list while
- * holding j->pin_lock.
- */
-static void journal_pin_add_entry(struct journal *j,
- struct journal_entry_pin_list *pin_list,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
-}
-
-/*
- * Pin the journal entry that reservation @res belongs to. When @res holds a
- * reference (res->ref set) the pin list for res->seq is used; otherwise the
- * pin goes on j->replay_pin_list. Takes j->pin_lock.
- */
-void bch2_journal_pin_add(struct journal *j,
- struct journal_res *res,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- struct journal_entry_pin_list *pin_list = res->ref
- ? journal_seq_pin(j, res->seq)
- : j->replay_pin_list;
-
- spin_lock_irq(&j->pin_lock);
- __journal_pin_add(j, pin_list, pin, flush_fn);
- spin_unlock_irq(&j->pin_lock);
-}
-
-/*
- * Detach @pin from its pin list: clear pin->pin_list, unlink the pin if it
- * is still on a list, and drop the reference on the pin list.
- *
- * Returns true when that was the last reference on the pin list.
- */
-static inline bool __journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- struct journal_entry_pin_list *pin_list = pin->pin_list;
-
- pin->pin_list = NULL;
-
- /* journal_reclaim_work() might have already taken us off the list */
- if (!list_empty_careful(&pin->list))
- list_del_init(&pin->list);
-
- return atomic_dec_and_test(&pin_list->count);
-}
-
-/*
- * Drop @pin if it is active, under j->pin_lock (irqsave), and wake j->wait
- * after releasing the lock when the pin list's count reached zero.
- */
-void bch2_journal_pin_drop(struct journal *j,
- struct journal_entry_pin *pin)
-{
- unsigned long flags;
- bool wakeup = false;
-
- spin_lock_irqsave(&j->pin_lock, flags);
- if (journal_pin_active(pin))
- wakeup = __journal_pin_drop(j, pin);
- spin_unlock_irqrestore(&j->pin_lock, flags);
-
- /*
- * Unpinning a journal entry may make journal_next_bucket() succeed, if
- * writing a new last_seq will now make another bucket available:
- *
- * Nested irqsave is expensive, don't do the wakeup with lock held:
- */
- if (wakeup)
- wake_up(&j->wait);
-}
-
-/*
- * Make @pin pin the same journal entry as @src_pin, but only when @src_pin
- * is active and either @pin is inactive or @src_pin's pin list sits earlier
- * in the j->pin fifo than @pin's. An active @pin is dropped from its
- * current list first. Takes j->pin_lock.
- */
-void bch2_journal_pin_add_if_older(struct journal *j,
- struct journal_entry_pin *src_pin,
- struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
-{
- spin_lock_irq(&j->pin_lock);
-
- if (journal_pin_active(src_pin) &&
- (!journal_pin_active(pin) ||
- fifo_entry_idx(&j->pin, src_pin->pin_list) <
- fifo_entry_idx(&j->pin, pin->pin_list))) {
- if (journal_pin_active(pin))
- __journal_pin_drop(j, pin);
- __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
- }
-
- spin_unlock_irq(&j->pin_lock);
-}
-
-/*
- * Find the next pin that needs flushing: the first pin on the first
- * non-empty pin list whose sequence number is <= @seq_to_flush.
- *
- * The pin found is moved to its pin list's ->flushed list and its sequence
- * number is stored in *@seq. Returns NULL (leaving *@seq untouched) when
- * there is nothing to flush.
- */
-static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
-{
- struct journal_entry_pin_list *pin_list;
- struct journal_entry_pin *ret = NULL;
- unsigned iter;
-
- /* so we don't iterate over empty fifo entries below: */
- if (!atomic_read(&fifo_peek_front(&j->pin).count)) {
- spin_lock(&j->lock);
- journal_reclaim_fast(j);
- spin_unlock(&j->lock);
- }
-
- spin_lock_irq(&j->pin_lock);
- fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
- if (journal_pin_seq(j, pin_list) > seq_to_flush)
- break;
-
- ret = list_first_entry_or_null(&pin_list->list,
- struct journal_entry_pin, list);
- if (ret) {
- /* must be list_del_init(), see bch2_journal_pin_drop() */
- list_move(&ret->list, &pin_list->flushed);
- *seq = journal_pin_seq(j, pin_list);
- break;
- }
- }
- spin_unlock_irq(&j->pin_lock);
-
- return ret;
-}
-
-/*
- * Run fast reclaim under j->lock, then report whether flushing up to
- * @seq_to_flush is complete: either only one pin list remains in the fifo
- * with a count of exactly 1, or last_seq(j) has moved past @seq_to_flush.
- */
-static bool journal_flush_done(struct journal *j, u64 seq_to_flush)
-{
- bool ret;
-
- spin_lock(&j->lock);
- journal_reclaim_fast(j);
-
- ret = (fifo_used(&j->pin) == 1 &&
- atomic_read(&fifo_peek_front(&j->pin).count) == 1) ||
- last_seq(j) > seq_to_flush;
- spin_unlock(&j->lock);
-
- return ret;
-}
-
-/*
- * Call the flush callback of every pin on journal entries up to and
- * including @seq_to_flush. Does nothing before the journal has been
- * started. Once replay is done it also waits until journal_flush_done()
- * holds or the journal reports an error.
- */
-void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
-{
- struct journal_entry_pin *pin;
- u64 pin_seq;
-
- if (!test_bit(JOURNAL_STARTED, &j->flags))
- return;
-
- while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq)))
- pin->flush(j, pin, pin_seq);
-
- /*
- * If journal replay hasn't completed, the unreplayed journal entries
- * hold refs on their corresponding sequence numbers and thus this would
- * deadlock:
- */
- if (!test_bit(JOURNAL_REPLAY_DONE, &j->flags))
- return;
-
- wait_event(j->wait,
- journal_flush_done(j, seq_to_flush) ||
- bch2_journal_error(j));
-}
-
-int bch2_journal_flush_all_pins(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool flush;
-
- bch2_journal_flush_pins(j, U64_MAX);