]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/journal_types.h
Disable pristine-tar option in gbp.conf, since there is no pristine-tar branch.
[bcachefs-tools-debian] / libbcachefs / journal_types.h
index 87f378a6ac4fff2358e603931e6110272c78d4f7..011f7a0d4ebd8cd1b88c2a9c483d33138aaff592 100644 (file)
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _BCACHEFS_JOURNAL_TYPES_H
 #define _BCACHEFS_JOURNAL_TYPES_H
 
@@ -8,21 +9,39 @@
 #include "super_types.h"
 #include "fifo.h"
 
-struct journal_res;
+#define JOURNAL_BUF_BITS       2
+#define JOURNAL_BUF_NR         (1U << JOURNAL_BUF_BITS)
+#define JOURNAL_BUF_MASK       (JOURNAL_BUF_NR - 1)
 
 /*
- * We put two of these in struct journal; we used them for writes to the
- * journal that are being staged or in flight.
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
+ * the journal that are being staged or in flight.
  */
 struct journal_buf {
+       struct closure          io;
        struct jset             *data;
 
-       struct closure_waitlist wait;
+       __BKEY_PADDED(key, BCH_REPLICAS_MAX);
+       struct bch_devs_list    devs_written;
 
-       unsigned                size;
-       unsigned                disk_sectors;
-       /* bloom filter: */
-       unsigned long           has_inode[1024 / sizeof(unsigned long)];
+       struct closure_waitlist wait;
+       u64                     last_seq;       /* copy of data->last_seq */
+       long                    expires;
+       u64                     flush_time;
+
+       unsigned                buf_size;       /* size in bytes of @data */
+       unsigned                sectors;        /* maximum size for current entry */
+       unsigned                disk_sectors;   /* maximum size entry could have been, if
+                                                  buf_size was bigger */
+       unsigned                u64s_reserved;
+       bool                    noflush:1;      /* write has already been kicked off, and was noflush */
+       bool                    must_flush:1;   /* something wants a flush */
+       bool                    separate_flush:1;
+       bool                    need_flush_to_write_buffer:1;
+       bool                    write_started:1;
+       bool                    write_allocated:1;
+       bool                    write_done:1;
+       u8                      idx;
 };
 
 /*
@@ -30,8 +49,15 @@ struct journal_buf {
  * flushed:
  */
 
+enum journal_pin_type {
+       JOURNAL_PIN_btree,
+       JOURNAL_PIN_key_cache,
+       JOURNAL_PIN_other,
+       JOURNAL_PIN_NR,
+};
+
 struct journal_entry_pin_list {
-       struct list_head                list;
+       struct list_head                list[JOURNAL_PIN_NR];
        struct list_head                flushed;
        atomic_t                        count;
        struct bch_devs_list            devs;
@@ -39,30 +65,13 @@ struct journal_entry_pin_list {
 
 struct journal;
 struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
                                struct journal_entry_pin *, u64);
 
 struct journal_entry_pin {
        struct list_head                list;
        journal_pin_flush_fn            flush;
-       struct journal_entry_pin_list   *pin_list;
-};
-
-/* corresponds to a btree node with a blacklisted bset: */
-struct blacklisted_node {
-       __le64                  seq;
-       enum btree_id           btree_id;
-       struct bpos             pos;
-};
-
-struct journal_seq_blacklist {
-       struct list_head        list;
-       u64                     seq;
-       bool                    written;
-       struct journal_entry_pin pin;
-
-       struct blacklisted_node *entries;
-       size_t                  nr_entries;
+       u64                             seq;
 };
 
 struct journal_res {
@@ -84,10 +93,12 @@ union journal_res_state {
 
        struct {
                u64             cur_entry_offset:20,
-                               idx:1,
-                               prev_buf_unwritten:1,
-                               buf0_count:21,
-                               buf1_count:21;
+                               idx:2,
+                               unwritten_idx:2,
+                               buf0_count:10,
+                               buf1_count:10,
+                               buf2_count:10,
+                               buf3_count:10;
        };
 };
 
@@ -104,50 +115,119 @@ union journal_res_state {
 #define JOURNAL_ENTRY_CLOSED_VAL       (JOURNAL_ENTRY_OFFSET_MAX - 1)
 #define JOURNAL_ENTRY_ERROR_VAL                (JOURNAL_ENTRY_OFFSET_MAX)
 
-/*
- * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
- * either because something's waiting on the write to complete or because it's
- * been dirty too long and the timer's expired.
- */
+struct journal_space {
+       /* Units of 512-byte sectors: */
+       unsigned        next_entry; /* How big the next journal entry can be */
+       unsigned        total;
+};
+
+enum journal_space_from {
+       journal_space_discarded,
+       journal_space_clean_ondisk,
+       journal_space_clean,
+       journal_space_total,
+       journal_space_nr,
+};
 
-enum {
+enum journal_flags {
        JOURNAL_REPLAY_DONE,
        JOURNAL_STARTED,
-       JOURNAL_NEED_WRITE,
+       JOURNAL_MAY_SKIP_FLUSH,
+       JOURNAL_NEED_FLUSH_WRITE,
+};
+
+/* Reasons we may fail to get a journal reservation: */
+#define JOURNAL_ERRORS()               \
+       x(ok)                           \
+       x(retry)                        \
+       x(blocked)                      \
+       x(max_in_flight)                \
+       x(journal_full)                 \
+       x(journal_pin_full)             \
+       x(journal_stuck)                \
+       x(insufficient_devices)
+
+enum journal_errors {
+#define x(n)   JOURNAL_ERR_##n,
+       JOURNAL_ERRORS()
+#undef x
+};
+
+typedef DARRAY(u64)            darray_u64;
+
+struct journal_bio {
+       struct bch_dev          *ca;
+       unsigned                buf_idx;
+
+       struct bio              bio;
 };
 
 /* Embedded in struct bch_fs */
 struct journal {
        /* Fastpath stuff up front: */
+       struct {
+
+       union journal_res_state reservations;
+       enum bch_watermark      watermark;
+
+       } __aligned(SMP_CACHE_BYTES);
 
        unsigned long           flags;
 
-       union journal_res_state reservations;
+       /* Max size of current journal entry */
        unsigned                cur_entry_u64s;
-       unsigned                prev_buf_sectors;
-       unsigned                cur_buf_sectors;
+       unsigned                cur_entry_sectors;
+
+       /* Reserved space in journal entry to be used just prior to write */
+       unsigned                entry_u64s_reserved;
+
+
+       /*
+        * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+        * insufficient devices:
+        */
+       enum journal_errors     cur_entry_error;
+
        unsigned                buf_size_want;
+       /*
+        * We may queue up some things to be journalled (log messages) before
+        * the journal has actually started - stash them here:
+        */
+       darray_u64              early_journal_entries;
 
+       /*
+        * Protects journal_buf->data, when accessing without a journal
+        * reservation: for synchronization between the btree write buffer code
+        * and the journal write path:
+        */
+       struct mutex            buf_lock;
        /*
         * Two journal entries -- one is currently open for new entries, the
         * other is possibly being written out.
         */
-       struct journal_buf      buf[2];
+       struct journal_buf      buf[JOURNAL_BUF_NR];
 
        spinlock_t              lock;
 
+       /* if nonzero, we may not open a new journal entry: */
+       unsigned                blocked;
+
        /* Used when waiting because the journal was full */
        wait_queue_head_t       wait;
+       struct closure_waitlist async_wait;
 
-       struct closure          io;
        struct delayed_work     write_work;
-       unsigned long           replicas_failed;
+       struct workqueue_struct *wq;
 
        /* Sequence number of most recent journal entry (last entry in @pin) */
        atomic64_t              seq;
 
-       /* last_seq from the most recent journal entry written */
+       /* seq, last_seq from the most recent journal entry successfully written */
+       u64                     seq_ondisk;
+       u64                     flushed_seq_ondisk;
        u64                     last_seq_ondisk;
+       u64                     err_seq;
+       u64                     last_empty_seq;
 
        /*
         * FIFO of journal entries whose btree updates have not yet been
@@ -165,42 +245,56 @@ struct journal {
         * needed. When all journal entries in the oldest journal bucket are no
         * longer needed, the bucket can be discarded and reused.
         */
-       DECLARE_FIFO(struct journal_entry_pin_list, pin);
-       struct journal_entry_pin_list *replay_pin_list;
+       struct {
+               u64 front, back, size, mask;
+               struct journal_entry_pin_list *data;
+       }                       pin;
 
-       /*
-        * Protects the pin lists - the fifo itself is still protected by
-        * j->lock though:
-        */
-       spinlock_t              pin_lock;
+       struct journal_space    space[journal_space_nr];
 
-       struct mutex            blacklist_lock;
-       struct list_head        seq_blacklist;
+       u64                     replay_journal_seq;
+       u64                     replay_journal_seq_end;
 
-       BKEY_PADDED(key);
        struct write_point      wp;
+       spinlock_t              err_lock;
+
+       struct mutex            reclaim_lock;
+       /*
+        * Used for waiting until journal reclaim has freed up space in the
+        * journal:
+        */
+       wait_queue_head_t       reclaim_wait;
+       struct task_struct      *reclaim_thread;
+       bool                    reclaim_kicked;
+       unsigned long           next_reclaim;
+       u64                     nr_direct_reclaim;
+       u64                     nr_background_reclaim;
 
-       struct delayed_work     reclaim_work;
        unsigned long           last_flushed;
+       struct journal_entry_pin *flush_in_progress;
+       bool                    flush_in_progress_dropped;
+       wait_queue_head_t       pin_flush_wait;
 
-       /* protects advancing ja->last_idx: */
-       struct mutex            reclaim_lock;
-       unsigned                write_delay_ms;
-       unsigned                reclaim_delay_ms;
+       /* protects advancing ja->discard_idx: */
+       struct mutex            discard_lock;
+       bool                    can_discard;
+
+       unsigned long           last_flush_write;
 
-       u64                     res_get_blocked_start;
-       u64                     need_write_time;
        u64                     write_start_time;
 
-       struct time_stats       *write_time;
-       struct time_stats       *delay_time;
-       struct time_stats       *blocked_time;
+       u64                     nr_flush_writes;
+       u64                     nr_noflush_writes;
+       u64                     entry_bytes_written;
+
+       struct time_stats       *flush_write_time;
+       struct time_stats       *noflush_write_time;
        struct time_stats       *flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map      res_map;
 #endif
-};
+} __aligned(SMP_CACHE_BYTES);
 
 /*
  * Embedded in struct bch_dev. First three fields refer to the array of journal
@@ -215,25 +309,29 @@ struct journal_device {
 
        unsigned                sectors_free;
 
-       /* Journal bucket we're currently writing to */
-       unsigned                cur_idx;
-
-       /* Last journal bucket that still contains an open journal entry */
-
        /*
-        * j->lock and j->reclaim_lock must both be held to modify, j->lock
-        * sufficient to read:
+        * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
         */
-       unsigned                last_idx;
+       unsigned                discard_idx;            /* Next bucket to discard */
+       unsigned                dirty_idx_ondisk;
+       unsigned                dirty_idx;
+       unsigned                cur_idx;                /* Journal bucket we're currently writing to */
        unsigned                nr;
+
        u64                     *buckets;
 
        /* Bio for journal reads/writes to this device */
-       struct bio              *bio;
-       u8                      ptr_idx;
+       struct journal_bio      *bio[JOURNAL_BUF_NR];
 
        /* for bch_journal_read_device */
        struct closure          read;
 };
 
+/*
+ * journal_entry_res - reserve space in every journal entry:
+ */
+struct journal_entry_res {
+       unsigned                u64s;
+};
+
 #endif /* _BCACHEFS_JOURNAL_TYPES_H */