X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fjournal.h;h=011711e99c8d825ec968cf513f82c08a66ecabc5;hb=d320a4e927fd706b34c714b77130965a385ea4fb;hp=9ad82c6081c18524a70fbdc3530e92cd09f46ac4;hpb=e783d814e83b2309930e1f6459212da6da8c8a54;p=bcachefs-tools-debian

diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 9ad82c6..011711e 100644
--- a/libbcachefs/journal.h
+++ b/libbcachefs/journal.h
@@ -1,5 +1,6 @@
-#ifndef _BCACHE_JOURNAL_H
-#define _BCACHE_JOURNAL_H
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_JOURNAL_H
+#define _BCACHEFS_JOURNAL_H
 
 /*
  * THE JOURNAL:
@@ -28,8 +29,8 @@
  *
  * Synchronous updates are specified by passing a closure (@flush_cl) to
  * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
- * down to the journalling code. That closure will will wait on the journal
- * write to complete (via closure_wait()).
+ * down to the journalling code. That closure will wait on the journal write to
+ * complete (via closure_wait()).
  *
  * If the index update wasn't synchronous, the journal entry will be
  * written out after 10 ms have elapsed, by default (the delay_ms field
@@ -112,62 +113,56 @@
 
 #include "journal_types.h"
 
-/*
- * Only used for holding the journal entries we read in btree_journal_read()
- * during cache_registration
- */
-struct journal_replay {
-	struct list_head	list;
-	struct jset		j;
-};
-
-#define JOURNAL_PIN	(32 * 1024)
+struct bch_fs;
 
-static inline bool journal_pin_active(struct journal_entry_pin *pin)
+static inline void journal_wake(struct journal *j)
 {
-	return pin->pin_list != NULL;
+	wake_up(&j->wait);
+	closure_wake_up(&j->async_wait);
+	closure_wake_up(&j->preres_wait);
 }
 
-static inline struct journal_entry_pin_list *
-journal_seq_pin(struct journal *j, u64 seq)
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
 {
-	return &j->pin.data[(size_t) seq & j->pin.mask];
+	return j->buf + j->reservations.idx;
 }
 
-void bch2_journal_pin_add(struct journal *, struct journal_res *,
-			  struct journal_entry_pin *, journal_pin_flush_fn);
-void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
-void bch2_journal_pin_add_if_older(struct journal *,
-				   struct journal_entry_pin *,
-				   struct journal_entry_pin *,
-				   journal_pin_flush_fn);
-void bch2_journal_flush_pins(struct journal *);
+/* Sequence number of oldest dirty journal entry */
 
-struct closure;
-struct bch_fs;
-struct keylist;
+static inline u64 journal_last_seq(struct journal *j)
+{
+	return j->pin.front;
+}
 
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
-					    enum btree_id, unsigned *);
+static inline u64 journal_cur_seq(struct journal *j)
+{
+	EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
 
-int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
+	return j->pin.back - 1;
+}
 
-u64 bch2_inode_journal_seq(struct journal *, u64);
+static inline u64 journal_last_unwritten_seq(struct journal *j)
+{
+	return j->seq_ondisk + 1;
+}
 
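/*
 * Editorial sketch, not part of the diff above: a standalone model of how
 * the pin FIFO's front/back cursors map to journal sequence numbers, as
 * journal_last_seq() and journal_cur_seq() assume. All model_* names are
 * illustrative stand-ins, not the real bcachefs types.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct model_fifo {
	uint64_t front;	/* oldest dirty sequence number still pinned */
	uint64_t back;	/* one past the sequence number currently open */
};

static uint64_t model_last_seq(const struct model_fifo *pin)
{
	return pin->front;
}

static uint64_t model_cur_seq(const struct model_fifo *pin)
{
	return pin->back - 1;
}

int main(void)
{
	/* Journal entries 10..14 are dirty; 14 is the open entry: */
	struct model_fifo pin = { .front = 10, .back = 15 };

	assert(model_last_seq(&pin) == 10);
	assert(model_cur_seq(&pin) == 14);

	/* Flushing/reclaiming the oldest dirty entry advances front: */
	pin.front++;
	printf("oldest dirty seq %llu, current seq %llu\n",
	       (unsigned long long) model_last_seq(&pin),
	       (unsigned long long) model_cur_seq(&pin));
	return 0;
}
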
 static inline int journal_state_count(union journal_res_state s, int idx)
 {
-	return idx == 0 ? s.buf0_count : s.buf1_count;
+	switch (idx) {
+	case 0: return s.buf0_count;
+	case 1: return s.buf1_count;
+	case 2: return s.buf2_count;
+	case 3: return s.buf3_count;
+	}
+	BUG();
 }
 
 static inline void journal_state_inc(union journal_res_state *s)
 {
 	s->buf0_count += s->idx == 0;
 	s->buf1_count += s->idx == 1;
-}
-
-static inline void bch2_journal_set_has_inode(struct journal_buf *buf, u64 inum)
-{
-	set_bit(hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)), buf->has_inode);
+	s->buf2_count += s->idx == 2;
+	s->buf3_count += s->idx == 3;
 }
 
 /*
@@ -179,63 +174,121 @@ static inline unsigned jset_u64s(unsigned u64s)
 	return u64s + sizeof(struct jset_entry) / sizeof(u64);
 }
 
-static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
-					     const void *data, size_t u64s,
-					     unsigned type, enum btree_id id,
-					     unsigned level, unsigned offset)
+static inline int journal_entry_overhead(struct journal *j)
 {
-	struct jset_entry *entry = vstruct_idx(buf->data, offset);
+	return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
 
+static inline struct jset_entry *
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
+{
+	struct jset *jset = buf->data;
+	struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
+
+	memset(entry, 0, sizeof(*entry));
 	entry->u64s = cpu_to_le16(u64s);
-	entry->btree_id = id;
-	entry->level = level;
-	entry->flags = 0;
-	SET_JOURNAL_ENTRY_TYPE(entry, type);
 
-	memcpy_u64s(entry->_data, data, u64s);
+	le32_add_cpu(&jset->u64s, jset_u64s(u64s));
+
+	return entry;
 }
 
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
-					 enum btree_id id, const struct bkey_i *k)
+static inline struct jset_entry *
+journal_res_entry(struct journal *j, struct journal_res *res)
 {
-	struct journal_buf *buf = &j->buf[res->idx];
-	unsigned actual = jset_u64s(k->k.u64s);
+	return vstruct_idx(j->buf[res->idx].data, res->offset);
+}
 
-	EBUG_ON(!res->ref);
-	BUG_ON(actual > res->u64s);
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
+					  enum btree_id id, unsigned level,
+					  unsigned u64s)
+{
+	entry->u64s	= cpu_to_le16(u64s);
+	entry->btree_id = id;
+	entry->level	= level;
+	entry->type	= type;
+	entry->pad[0]	= 0;
+	entry->pad[1]	= 0;
+	entry->pad[2]	= 0;
+	return jset_u64s(u64s);
+}
+
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+					 enum btree_id id, unsigned level,
+					 const void *data, unsigned u64s)
+{
+	unsigned ret = journal_entry_init(entry, type, id, level, u64s);
 
-	bch2_journal_set_has_inode(buf, k->k.p.inode);
+	memcpy_u64s_small(entry->_data, data, u64s);
+	return ret;
+}
+
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+		       unsigned type, enum btree_id id,
+		       unsigned level, unsigned u64s)
+{
+	struct jset_entry *entry = journal_res_entry(j, res);
+	unsigned actual = journal_entry_init(entry, type, id, level, u64s);
 
-	bch2_journal_add_entry_at(buf, k, k->k.u64s,
-				  JOURNAL_ENTRY_BTREE_KEYS, id,
-				  0, res->offset);
+	EBUG_ON(!res->ref);
+	EBUG_ON(actual > res->u64s);
 
 	res->offset += actual;
 	res->u64s -= actual;
+	return entry;
 }
 
-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+static inline bool journal_entry_empty(struct jset *j)
+{
+	struct jset_entry *i;
+
+	if (j->seq != j->last_seq)
+		return false;
+
+	vstruct_for_each(j, i)
+		if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)
+			return false;
+	return true;
+}
 
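/*
 * Editorial sketch, not part of the diff above: a standalone model of the
 * u64 accounting that jset_u64s() and journal_entry_init() rely on -- each
 * journal entry is an 8-byte header followed by a payload counted in u64s.
 * The model_* types are simplified stand-ins for the real jset_entry.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct model_entry {
	uint16_t u64s;		/* payload size in u64s (little endian upstream) */
	uint8_t  btree_id;
	uint8_t  level;
	uint8_t  type;
	uint8_t  pad[3];
	uint64_t data[];	/* payload follows the 8-byte header */
};

/* Total size of an entry in u64s: one u64 of header plus the payload */
static unsigned model_jset_u64s(unsigned u64s)
{
	return u64s + sizeof(struct model_entry) / sizeof(uint64_t);
}

int main(void)
{
	uint64_t buf[32] = { 0 };
	uint64_t key[3] = { 1, 2, 3 };	/* a 3-u64 payload */
	struct model_entry *entry = (void *) buf;

	entry->u64s = 3;
	entry->type = 1;
	memcpy(entry->data, key, sizeof(key));

	/* A 3-u64 payload consumes 4 u64s of journal reservation: */
	printf("reservation consumed: %u u64s\n", model_jset_u64s(entry->u64s));
	return 0;
}
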
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
-					bool need_write_just_set)
+/*
+ * Drop a reference on a buffer index and return the resulting reservation
+ * state; callers check whether that buffer's count has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
 {
 	union journal_res_state s;
 
 	s.v = atomic64_sub_return(((union journal_res_state) {
 				  .buf0_count = idx == 0,
 				  .buf1_count = idx == 1,
+				  .buf2_count = idx == 2,
+				  .buf3_count = idx == 3,
 				  }).v, &j->reservations.counter);
+	return s;
+}
+
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
 
-	EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx))
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
 
-	/*
-	 * Do not initiate a journal write if the journal is in an error state
-	 * (previous journal entry write may have failed)
-	 */
-	if (s.idx != idx &&
-	    !journal_state_count(s, idx) &&
-	    s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
-		bch2_journal_buf_put_slowpath(j, need_write_just_set);
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+	union journal_res_state s;
+
+	s = journal_state_buf_put(j, idx);
+	if (!journal_state_count(s, idx)) {
+		spin_lock(&j->lock);
+		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		spin_unlock(&j->lock);
+	}
 }
 
 /*
@@ -248,28 +301,33 @@ static inline void bch2_journal_res_put(struct journal *j,
 	if (!res->ref)
 		return;
 
-	lock_release(&j->res_map, 0, _RET_IP_);
+	lock_release(&j->res_map, _THIS_IP_);
 
-	while (res->u64s) {
-		bch2_journal_add_entry_at(&j->buf[res->idx], NULL, 0,
-					  JOURNAL_ENTRY_BTREE_KEYS,
-					  0, 0, res->offset);
-		res->offset += jset_u64s(0);
-		res->u64s -= jset_u64s(0);
-	}
+	while (res->u64s)
+		bch2_journal_add_entry(j, res,
+				       BCH_JSET_ENTRY_btree_keys,
+				       0, 0, 0);
 
-	bch2_journal_buf_put(j, res->idx, false);
+	bch2_journal_buf_put(j, res->idx, res->seq);
 
 	res->ref = 0;
 }
 
 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
-				  unsigned, unsigned);
+				  unsigned);
+
+/* First bits for BCH_WATERMARK: */
+enum journal_res_flags {
+	__JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS,
+	__JOURNAL_RES_GET_CHECK,
+};
+
+#define JOURNAL_RES_GET_NONBLOCK	(1 << __JOURNAL_RES_GET_NONBLOCK)
+#define JOURNAL_RES_GET_CHECK		(1 << __JOURNAL_RES_GET_CHECK)
 
 static inline int journal_res_get_fast(struct journal *j,
 				       struct journal_res *res,
-				       unsigned u64s_min,
-				       unsigned u64s_max)
+				       unsigned flags)
 {
 	union journal_res_state old, new;
 	u64 v = atomic64_read(&j->reservations.counter);
@@ -281,51 +339,172 @@ static inline int journal_res_get_fast(struct journal *j,
 		 * Check if there is still room in the current journal
 		 * entry:
 		 */
-		if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+		if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
 			return 0;
 
-		res->offset = old.cur_entry_offset;
-		res->u64s = min(u64s_max, j->cur_entry_u64s -
-				old.cur_entry_offset);
+		EBUG_ON(!journal_state_count(new, new.idx));
+
+		if ((flags & BCH_WATERMARK_MASK) < j->watermark)
+			return 0;
 
-		journal_state_inc(&new);
 		new.cur_entry_offset += res->u64s;
+		journal_state_inc(&new);
+
+		/*
+		 * If the refcount would overflow, we have to wait:
+		 * XXX - tracepoint this:
+		 */
+		if (!journal_state_count(new, new.idx))
+			return 0;
+
+		if (flags & JOURNAL_RES_GET_CHECK)
+			return 1;
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
-	res->ref = true;
-	res->idx = new.idx;
-	res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+	res->ref	= true;
+	res->idx	= old.idx;
+	res->offset	= old.cur_entry_offset;
+	res->seq	= le64_to_cpu(j->buf[old.idx].data->seq);
 	return 1;
 }
 
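/*
 * Editorial sketch, not part of the diff above: the compare-and-swap retry
 * pattern used by journal_res_get_fast(), reduced to a single packed offset
 * field with C11 atomics. ENTRY_U64S and the model_* names are made up for
 * illustration; the real code packs idx, counts and offset into one u64.
 */
#include <stdatomic.h>
#include <stdio.h>

#define ENTRY_U64S 128	/* capacity of the open journal entry (illustrative) */

static _Atomic unsigned cur_entry_offset;

/* Returns the reserved offset, or -1 if the open entry is full */
static int model_res_get_fast(unsigned u64s)
{
	unsigned old = atomic_load(&cur_entry_offset);
	unsigned new;

	do {
		if (old + u64s > ENTRY_U64S)
			return -1;	/* no room: caller takes the slowpath */
		new = old + u64s;
		/* on failure, 'old' is reloaded and the loop retries */
	} while (!atomic_compare_exchange_weak(&cur_entry_offset, &old, new));

	return old;	/* the reservation starts at the old offset */
}

int main(void)
{
	printf("first reservation at offset %d\n", model_res_get_fast(16));
	printf("second reservation at offset %d\n", model_res_get_fast(16));
	return 0;
}
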
 static inline int bch2_journal_res_get(struct journal *j,
 				       struct journal_res *res,
-				       unsigned u64s_min, unsigned u64s_max)
+				       unsigned u64s, unsigned flags)
 {
 	int ret;
 
 	EBUG_ON(res->ref);
-	EBUG_ON(u64s_max < u64s_min);
+	EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
+
+	res->u64s = u64s;
 
-	if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+	if (journal_res_get_fast(j, res, flags))
 		goto out;
 
-	ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+	ret = bch2_journal_res_get_slowpath(j, res, flags);
 	if (ret)
 		return ret;
 out:
-	lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
-	EBUG_ON(!res->ref);
+	if (!(flags & JOURNAL_RES_GET_CHECK)) {
+		lock_acquire_shared(&j->res_map, 0,
+				    (flags & JOURNAL_RES_GET_NONBLOCK) != 0,
+				    NULL, _THIS_IP_);
+		EBUG_ON(!res->ref);
+	}
 	return 0;
 }
 
-void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
-void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
+/* journal_preres: */
+
+static inline void journal_set_watermark(struct journal *j)
+{
+	union journal_preres_state s = READ_ONCE(j->prereserved);
+	unsigned watermark = BCH_WATERMARK_stripe;
+
+	if (fifo_free(&j->pin) < j->pin.size / 4)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
+	if (fifo_free(&j->pin) < j->pin.size / 8)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+	if (s.reserved > s.remaining)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
+	if (!s.remaining)
+		watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
+
+	if (watermark == j->watermark)
+		return;
+
+	swap(watermark, j->watermark);
+	if (watermark > j->watermark)
+		journal_wake(j);
+}
+
+static inline void bch2_journal_preres_put(struct journal *j,
+					   struct journal_preres *res)
+{
+	union journal_preres_state s = { .reserved = res->u64s };
+
+	if (!res->u64s)
+		return;
+
+	s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
+	res->u64s = 0;
+
+	if (unlikely(s.waiting)) {
+		clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
+			  (unsigned long *) &j->prereserved.v);
+		closure_wake_up(&j->preres_wait);
+	}
+
+	if (s.reserved <= s.remaining && j->watermark)
+		journal_set_watermark(j);
+}
+
+int __bch2_journal_preres_get(struct journal *,
+			      struct journal_preres *, unsigned, unsigned);
+
+static inline int bch2_journal_preres_get_fast(struct journal *j,
+					       struct journal_preres *res,
+					       unsigned new_u64s,
+					       unsigned flags,
+					       bool set_waiting)
+{
+	int d = new_u64s - res->u64s;
+	union journal_preres_state old, new;
+	u64 v = atomic64_read(&j->prereserved.counter);
+	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
+	int ret;
+
+	do {
+		old.v = new.v = v;
+		ret = 0;
+
+		if (watermark == BCH_WATERMARK_reclaim ||
+		    new.reserved + d < new.remaining) {
+			new.reserved += d;
+			ret = 1;
+		} else if (set_waiting && !new.waiting)
+			new.waiting = true;
+		else
+			return 0;
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
				       old.v, new.v)) != old.v);
+
+	if (ret)
+		res->u64s += d;
+	return ret;
+}
+
+static inline int bch2_journal_preres_get(struct journal *j,
+					  struct journal_preres *res,
+					  unsigned new_u64s,
+					  unsigned flags)
+{
+	if (new_u64s <= res->u64s)
+		return 0;
+
+	if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
+		return 0;
+
+	if (flags & JOURNAL_RES_GET_NONBLOCK)
+		return -BCH_ERR_journal_preres_get_blocked;
+
+	return __bch2_journal_preres_get(j, res, new_u64s, flags);
+}
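/*
 * Editorial sketch, not part of the diff above: the prereservation
 * bookkeeping that bch2_journal_preres_get_fast() implements -- optimistic
 * reservations against a shared budget, with the reclaim watermark allowed
 * to overcommit so reclaim always makes progress. Single-threaded and
 * illustrative only; the real code packs this state into one atomic u64.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_preres_state {
	unsigned reserved;	/* u64s promised to outstanding preres */
	unsigned remaining;	/* budget available in the journal */
};

static bool model_preres_get(struct model_preres_state *s, unsigned d,
			     bool is_reclaim)
{
	/* reclaim must always make progress, so it may overcommit: */
	if (!is_reclaim && s->reserved + d >= s->remaining)
		return false;	/* caller would block or get -EAGAIN */
	s->reserved += d;
	return true;
}

int main(void)
{
	struct model_preres_state s = { .reserved = 90, .remaining = 100 };

	printf("normal get of 20: %s\n",
	       model_preres_get(&s, 20, false) ? "ok" : "blocked");
	printf("reclaim get of 20: %s\n",
	       model_preres_get(&s, 20, true) ? "ok" : "blocked");
	printf("now reserved=%u remaining=%u\n", s.reserved, s.remaining);
	return 0;
}
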
+
+/* journal_entry_res: */
+
+void bch2_journal_entry_res_resize(struct journal *,
+				   struct journal_entry_res *,
+				   unsigned);
+
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
 void bch2_journal_flush_async(struct journal *, struct closure *);
-void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
 int bch2_journal_meta(struct journal *);
 
 void bch2_journal_halt(struct journal *);
@@ -336,16 +515,7 @@ static inline int bch2_journal_error(struct journal *j)
 		? -EIO : 0;
 }
 
-static inline bool journal_flushes_device(struct bch_dev *ca)
-{
-	return true;
-}
-
-void bch2_journal_start(struct bch_fs *);
-int bch2_journal_mark(struct bch_fs *, struct list_head *);
-void bch2_journal_entries_free(struct list_head *);
-int bch2_journal_read(struct bch_fs *, struct list_head *);
-int bch2_journal_replay(struct bch_fs *, struct list_head *);
+struct bch_dev;
 
 static inline void bch2_journal_set_replay_done(struct journal *j)
 {
@@ -353,23 +523,27 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 	set_bit(JOURNAL_REPLAY_DONE, &j->flags);
 }
 
-ssize_t bch2_journal_print_debug(struct journal *, char *);
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
 
-int bch2_dev_journal_alloc(struct bch_dev *);
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
-	return j
-		? (__le64 *) vstruct_end(&j->field) - j->buckets
-		: 0;
-}
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
+				unsigned nr);
+int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
 
-int bch2_journal_move(struct bch_dev *);
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 
 void bch2_fs_journal_stop(struct journal *);
+int bch2_fs_journal_start(struct journal *, u64);
+
 void bch2_dev_journal_exit(struct bch_dev *);
 int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
 void bch2_fs_journal_exit(struct journal *);
 int bch2_fs_journal_init(struct journal *);
 
-#endif /* _BCACHE_JOURNAL_H */
+#endif /* _BCACHEFS_JOURNAL_H */
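
/*
 * Editorial sketch, not part of the diff above: the semantics behind the
 * bch2_journal_flush_seq() declaration -- block until everything up to and
 * including @seq is persistent. The "device write" here is simulated and
 * all model_* names are illustrative; the real code sleeps on j->wait.
 */
#include <stdint.h>
#include <stdio.h>

struct model_journal {
	uint64_t seq;		/* sequence number of the open entry */
	uint64_t seq_ondisk;	/* highest sequence number known persistent */
};

/* Simulate writing out one journal entry */
static void model_write_one(struct model_journal *j)
{
	if (j->seq_ondisk < j->seq)
		j->seq_ondisk++;
}

static int model_flush_seq(struct model_journal *j, uint64_t seq)
{
	if (seq > j->seq)
		return -1;	/* can't flush an entry that doesn't exist yet */
	while (j->seq_ondisk < seq)
		model_write_one(j);	/* the real code waits, not spins */
	return 0;
}

int main(void)
{
	struct model_journal j = { .seq = 7, .seq_ondisk = 4 };

	if (!model_flush_seq(&j, 6))
		printf("entries through seq 6 now on disk (ondisk=%llu)\n",
		       (unsigned long long) j.seq_ondisk);
	return 0;
}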