1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_EXTENTS_FORMAT_H
3 #define _BCACHEFS_EXTENTS_FORMAT_H
/*
 * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
 * preceded by checksum/compression information (bch_extent_crc32 or
 * bch_extent_crc64).
 *
 * One major determining factor in the format of extents is how we handle and
 * represent extents that have been partially overwritten and thus trimmed:
 *
 * If an extent is not checksummed or compressed, when the extent is trimmed we
 * don't have to remember the extent we originally allocated and wrote: we can
 * merely adjust ptr->offset to point to the start of the data that is currently
 * live. The size field in struct bkey records the current (live) size of the
 * extent, and is also used to mean "size of region on disk that we point to" in
 * this case.
 *
 * Thus an extent that is not checksummed or compressed will consist only of a
 * list of bch_extent_ptrs, with none of the fields in
 * bch_extent_crc32/bch_extent_crc64.
 *
 * When an extent is checksummed or compressed, it's not possible to read only
 * the data that is currently live: we have to read the entire extent that was
 * originally written, and then return only the part of the extent that is
 * currently live.
 *
 * Thus, in addition to the current size of the extent in struct bkey, we need
 * to store the size of the originally allocated space - this is the
 * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
 * when the extent is trimmed, instead of modifying the offset field of the
 * pointer, we keep a second smaller offset field - "offset into the original
 * extent of the currently live region".
 *
 * The other major determining factor is replication and data migration:
 *
 * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
 * write, we will initially write all the replicas in the same format, with the
 * same checksum type and compression format - however, when copygc runs later (or
 * tiering/cache promotion, anything that moves data), it is not in general
 * going to rewrite all the pointers at once - one of the replicas may be in a
 * bucket on one device that has very little fragmentation while another lives
 * in a bucket that has become heavily fragmented, and thus is being rewritten
 * sooner than the rest.
 *
 * Thus it will only move a subset of the pointers (or in the case of
 * tiering/cache promotion perhaps add a single pointer without dropping any
 * current pointers), and if the extent has been partially overwritten it must
 * write only the currently live portion (or copygc would not be able to reduce
 * fragmentation!) - which necessitates a different bch_extent_crc format for
 * the trimmed pointers.
 *
 * But in the interests of space efficiency, we don't want to store one
 * bch_extent_crc for each pointer if we don't have to.
 *
 * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
 * bch_extent_ptrs appended arbitrarily one after the other. We determine the
 * type of a given entry with a scheme similar to utf8 (except we're encoding a
 * type, not a size), encoding the type in the position of the first set bit:
 *
 * bch_extent_crc32	- 0b1
 * bch_extent_ptr	- 0b10
 * bch_extent_crc64	- 0b100
 *
 * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
 * bch_extent_crc64 is the least constrained).
 *
 * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
 * until the next bch_extent_crc32/64.
 *
 * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
 * is neither checksummed nor compressed.
 */
76 #define BCH_EXTENT_ENTRY_TYPES() \
/*
 * Number of extent entry types -- presumably one per x() entry in
 * BCH_EXTENT_ENTRY_TYPES() above; NOTE(review): confirm against the full
 * x-macro list, which is not visible in this extract.
 */
#define BCH_EXTENT_ENTRY_MAX	6
85 enum bch_extent_entry_type {
86 #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
87 BCH_EXTENT_ENTRY_TYPES()
/* Compressed/uncompressed size are stored biased by 1: */
/*
 * Smallest (most bit-constrained) checksum/compression entry; applies to
 * the bch_extent_ptrs that follow it (see the format comment above).
 * NOTE(review): interior bitfield lines of this struct appear to be
 * missing from this extract -- do not edit without the complete file.
 */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u32			compression_type:4,
				_uncompressed_size:7,
} __packed __aligned(8);

/* 7-bit size fields, biased by 1 => max 128 */
#define CRC32_SIZE_MAX		(1U << 7)
/* crc32 entries carry no nonce */
#define CRC32_NONCE_MAX		0
/*
 * Mid-sized checksum/compression entry: 9-bit sizes, 10-bit nonce.
 * NOTE(review): interior bitfield lines of this struct appear to be
 * missing from this extract -- do not edit without the complete file.
 */
struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
				_uncompressed_size:9,
#elif defined (__BIG_ENDIAN_BITFIELD)
				_uncompressed_size:9,
} __packed __aligned(8);

/* 9-bit size fields, biased by 1 => max 512 */
#define CRC64_SIZE_MAX		(1U << 9)
#define CRC64_NONCE_MAX		((1U << 10) - 1)
/*
 * Largest (least bit-constrained) checksum/compression entry: 13-bit
 * sizes and nonce, plus a full struct bch_csum for the checksum itself.
 * NOTE(review): interior bitfield lines of this struct appear to be
 * missing from this extract -- do not edit without the complete file.
 */
struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
				_uncompressed_size:13,
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			compression_type:4,
				_uncompressed_size:13,
	struct bch_csum		csum;
} __packed __aligned(8);

/* 13-bit size fields, biased by 1 => max 8192 */
#define CRC128_SIZE_MAX		(1U << 13)
#define CRC128_NONCE_MAX	((1U << 13) - 1)
/*
 * A single on-disk pointer within an extent value.
 *
 * @reservation - pointer hasn't been written to, just reserved
 *
 * NOTE(review): the comment opener and most bitfield members appear to be
 * missing from this extract -- do not edit without the complete file.
 */
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
				offset:44, /* 8 petabytes */
#elif defined (__BIG_ENDIAN_BITFIELD)
} __packed __aligned(8);
190 struct bch_extent_stripe_ptr {
191 #if defined(__LITTLE_ENDIAN_BITFIELD)
196 #elif defined (__BIG_ENDIAN_BITFIELD)
204 struct bch_extent_rebalance {
205 #if defined(__LITTLE_ENDIAN_BITFIELD)
208 compression:8, /* enum bch_compression_opt */
210 #elif defined (__BIG_ENDIAN_BITFIELD)
218 union bch_extent_entry {
219 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
221 #elif __BITS_PER_LONG == 32
227 #error edit for your odd byteorder.
230 #define x(f, n) struct bch_extent_##f f;
231 BCH_EXTENT_ENTRY_TYPES()
/*
 * Value format for btree node pointer keys (v1).
 * NOTE(review): leading members appear to be missing from this extract --
 * do not edit without the complete file.
 */
struct bch_btree_ptr {
	/* flexible array of pointers, presumably one per replica -- confirm */
	struct bch_extent_ptr	start[];
} __packed __aligned(8);
/*
 * Value format for btree node pointer keys (v2); adds per-node metadata
 * (e.g. sectors_written) ahead of the replica pointers.
 * NOTE(review): several members appear to be missing from this extract
 * (including the 'flags' field referenced by LE16_BITMASK below) --
 * do not edit without the complete file.
 */
struct bch_btree_ptr_v2 {
	__le16			sectors_written;
	struct bch_extent_ptr	start[];
} __packed __aligned(8);

/* Bit 0 of bch_btree_ptr_v2.flags: node's key range was updated */
LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,	struct bch_btree_ptr_v2, flags, 0, 1);
260 union bch_extent_entry start[];
261 } __packed __aligned(8);
/* Maximum size (in u64s) a single pointer could be: */
/* i.e. the largest crc entry (crc128) followed by one bch_extent_ptr */
#define BKEY_EXTENT_PTR_U64s_MAX\
	((sizeof(struct bch_extent_crc128) +			\
	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))

/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX				\
	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))

/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)

/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX				\
	((sizeof(struct bch_btree_ptr_v2) +			\
	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX					\
	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
/*
 * Value format for reservation keys (space reserved but not yet written).
 * NOTE(review): all members of this struct appear to be missing from this
 * extract -- do not edit without the complete file.
 */
struct bch_reservation {
} __packed __aligned(8);
290 struct bch_inline_data {
295 #endif /* _BCACHEFS_EXTENTS_FORMAT_H */