X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fjournal_types.h;h=8d8c0b3d5a30e7e86dbef1f26db55f3b9a3fd5f1;hb=9690f783569ebeb166dfc1745c0ba0f48db523d0;hp=5eea6579e7ebec54ffb195cb026052b997090af3;hpb=4de98a2712764bceb9e0f67b1ac2f2c7862feb77;p=bcachefs-tools-debian diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 5eea657..8d8c0b3 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_JOURNAL_TYPES_H #define _BCACHEFS_JOURNAL_TYPES_H @@ -8,23 +9,33 @@ #include "super_types.h" #include "fifo.h" -struct journal_res; +#define JOURNAL_BUF_BITS 2 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) /* - * We put two of these in struct journal; we used them for writes to the - * journal that are being staged or in flight. + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. */ struct journal_buf { struct jset *data; - BKEY_PADDED(key); + __BKEY_PADDED(key, BCH_REPLICAS_MAX); + struct bch_devs_list devs_written; struct closure_waitlist wait; - - unsigned size; - unsigned disk_sectors; - /* bloom filter: */ - unsigned long has_inode[1024 / sizeof(unsigned long)]; + u64 last_seq; /* copy of data->last_seq */ + long expires; + u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ + unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ + bool separate_flush; }; /* @@ -32,8 +43,15 @@ struct journal_buf { * flushed: */ +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + struct journal_entry_pin_list { - struct list_head list; + struct list_head list[JOURNAL_PIN_NR]; struct list_head flushed; atomic_t count; struct bch_devs_list devs; @@ -41,30 +59,13 @@ struct journal_entry_pin_list { struct journal; struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, +typedef int (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *, u64); struct journal_entry_pin { struct list_head list; journal_pin_flush_fn flush; - struct journal_entry_pin_list *pin_list; -}; - -/* corresponds to a btree node with a blacklisted bset: */ -struct blacklisted_node { - __le64 seq; - enum btree_id btree_id; - struct bpos pos; -}; - -struct journal_seq_blacklist { - struct list_head list; - u64 seq; - bool written; - struct journal_entry_pin pin; - - struct blacklisted_node *entries; - size_t nr_entries; + u64 seq; }; struct journal_res { @@ -75,6 +76,14 @@ struct journal_res { u64 seq; }; +/* + * For reserving space in the journal prior to getting a reservation on a + * particular journal entry: + */ +struct journal_preres { + unsigned u64s; +}; + union journal_res_state { struct { atomic64_t counter; @@ -86,10 +95,28 @@ union journal_res_state { struct { u64 cur_entry_offset:20, - idx:1, - prev_buf_unwritten:1, - buf0_count:21, - buf1_count:21; + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; + }; +}; + +union journal_preres_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 waiting:1, + reserved:31, + remaining:32; }; }; @@ -106,40 +133,108 @@ union journal_res_state { #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) -/* - * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, - * either because something's waiting on the write to complete or because it's - * been dirty too long and the timer's expired. - */ +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; -enum { +enum journal_flags { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_NEED_WRITE, + JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, }; +#define JOURNAL_WATERMARKS() \ + x(any) \ + x(copygc) \ + x(reserved) + +enum journal_watermark { +#define x(n) JOURNAL_WATERMARK_##n, + JOURNAL_WATERMARKS() +#undef x +}; + +#define JOURNAL_WATERMARK_MASK 3 + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x +}; + +typedef DARRAY(u64) darray_u64; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ + struct { + + union journal_res_state reservations; + enum journal_watermark watermark; + + union journal_preres_state prereserved; + + } __aligned(SMP_CACHE_BYTES); unsigned long flags; - union journal_res_state reservations; + /* Max size of current journal entry */ unsigned cur_entry_u64s; - unsigned prev_buf_sectors; - unsigned cur_buf_sectors; + unsigned cur_entry_sectors; + + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + enum journal_errors cur_entry_error; + unsigned buf_size_want; + /* + * We may queue up some things to be journalled (log messages) before + * the journal has actually started - stash them here: + */ + darray_u64 early_journal_entries; /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. */ - struct journal_buf buf[2]; + struct journal_buf buf[JOURNAL_BUF_NR]; spinlock_t lock; + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + /* Used when waiting because the journal was full */ wait_queue_head_t wait; + struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; struct closure io; struct delayed_work write_work; @@ -147,8 +242,12 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; - /* last_seq from the most recent journal entry written */ + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 flushed_seq_ondisk; u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; /* * FIFO of journal entries whose btree updates have not yet been @@ -166,37 +265,57 @@ struct journal { * needed. When all journal entries in the oldest journal bucket are no * longer needed, the bucket can be discarded and reused. */ - DECLARE_FIFO(struct journal_entry_pin_list, pin); - struct journal_entry_pin_list *replay_pin_list; + struct { + u64 front, back, size, mask; + struct journal_entry_pin_list *data; + } pin; + + struct journal_space space[journal_space_nr]; - struct mutex blacklist_lock; - struct list_head seq_blacklist; + u64 replay_journal_seq; + u64 replay_journal_seq_end; - BKEY_PADDED(key); struct write_point wp; spinlock_t err_lock; - struct delayed_work reclaim_work; + struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + unsigned long next_reclaim; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; + wait_queue_head_t pin_flush_wait; - /* protects advancing ja->last_idx: */ - struct mutex reclaim_lock; - unsigned write_delay_ms; - unsigned reclaim_delay_ms; + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; + bool can_discard; + + unsigned long last_flush_write; u64 res_get_blocked_start; - u64 need_write_time; u64 write_start_time; - struct time_stats *write_time; - struct time_stats *delay_time; - struct time_stats *blocked_time; - struct time_stats *flush_seq_time; + u64 nr_flush_writes; + u64 nr_noflush_writes; + + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; #endif -}; +} __aligned(SMP_CACHE_BYTES); /* * Embedded in struct bch_dev. First three fields refer to the array of journal @@ -211,17 +330,15 @@ struct journal_device { unsigned sectors_free; - /* Journal bucket we're currently writing to */ - unsigned cur_idx; - - /* Last journal bucket that still contains an open journal entry */ - /* - * j->lock and j->reclaim_lock must both be held to modify, j->lock - * sufficient to read: + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: */ - unsigned last_idx; + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ unsigned nr; + u64 *buckets; /* Bio for journal reads/writes to this device */ @@ -231,4 +348,11 @@ struct journal_device { struct closure read; }; +/* + * journal_entry_res - reserve space in every journal entry: + */ +struct journal_entry_res { + unsigned u64s; +}; + #endif /* _BCACHEFS_JOURNAL_TYPES_H */