From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 20 Dec 2021 00:37:29 +0000 (-0500)
Subject: Update bcachefs sources to ff3a76e1af bcachefs: Change need_whiteout_for_snapshot... 
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d06f5690fab526c4a4a8a4d55f9c4e675d883be9;p=bcachefs-tools-debian

Update bcachefs sources to ff3a76e1af bcachefs: Change need_whiteout_for_snapshot() to clone iterator
---

diff --git a/.bcachefs_revision b/.bcachefs_revision
index d0a5221..4bae87b 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-635ca475f4f40ddcb2976f8f20a89df4c574aa22
+ff3a76e1af04f51506f45e0f71d53f7e6dd51a75
diff --git a/.gitignore b/.gitignore
index 8feb598..b1c03cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,4 @@ tests/__pycache__/
 
 mount/target
 mount.bcachefs
-doc/bcachefs.5.rst
+bcachefs-principles-of-operation.*
diff --git a/Makefile b/Makefile
index e94419f..67c40e5 100644
--- a/Makefile
+++ b/Makefile
@@ -28,15 +28,6 @@ PYTEST_CMD?=$(shell \
 )
 PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS)
 
-RST2MAN_ARGS?=
-RST2MAN_CMD?=$(shell \
-	command -v rst2man \
-	|| which rst2man \
-	|| command -v rst2man.py \
-	|| which rst2man.py \
-)
-RST2MAN:=$(RST2MAN_CMD) $(RST2MAN_ARGS)
-
 CARGO_ARGS=
 CARGO=cargo $(CARGO_ARGS)
 CARGO_PROFILE=release
@@ -108,18 +99,6 @@ TAGS:
 tags:
 	ctags -R .
 
-DOCSRC := opts_macro.h bcachefs.5.rst.tmpl
-DOCGENERATED := bcachefs.5 doc/bcachefs.5.rst
-DOCDEPS := $(addprefix ./doc/,$(DOCSRC))
-bcachefs.5: $(DOCDEPS)  libbcachefs/opts.h
-ifneq (,$(RST2MAN_CMD))
-	$(CC) doc/opts_macro.h -I libbcachefs -I include -E 2>/dev/null	\
-		| doc/macro2rst.py
-	$(RST2MAN) doc/bcachefs.5.rst bcachefs.5
-else
-	@echo "WARNING: no rst2man found! Man page not generated."
-endif
-
 SRCS=$(shell find . -type f -iname '*.c')
 DEPS=$(SRCS:.c=.d)
 -include $(DEPS)
@@ -184,6 +163,11 @@ clean:
 deb: all
 	debuild -us -uc -nc -b -i -I
 
+bcachefs-principles-of-operation.pdf: bcachefs-principles-of-operation.tex
+	pdflatex bcachefs-principles-of-operation.tex && pdflatex bcachefs-principles-of-operation.tex
+
+doc: bcachefs-principles-of-operation.pdf
+
 .PHONY: update-bcachefs-sources
 update-bcachefs-sources:
 	git rm -rf --ignore-unmatch libbcachefs
diff --git a/bcachefs-principles-of-operation.tex b/bcachefs-principles-of-operation.tex
new file mode 100644
index 0000000..d5ac6ed
--- /dev/null
+++ b/bcachefs-principles-of-operation.tex
@@ -0,0 +1,1188 @@
+\documentclass{article}
+
+\usepackage{imakeidx}
+\usepackage[pdfborder={0 0 0}]{hyperref}
+\usepackage{longtable}
+
+\title{bcachefs: Principles of Operation}
+\author{Kent Overstreet}
+
+\date{}
+
+\begin{document}
+
+\maketitle
+\tableofcontents
+
+\section{Introduction and overview}
+
+Bcachefs is a modern, general purpose, copy on write filesystem descended from
+bcache, a block layer cache.
+
+The internal architecture is very different from most existing filesystems where
+the inode is central and many data structures hang off of the inode. Instead,
+bcachefs is architected more like a filesystem on top of a relational database,
+with tables for the different filesystem data types - extents, inodes, dirents,
+xattrs, et cetera.
+
+bcachefs supports almost all of the same features as other modern COW
+filesystems, such as ZFS and btrfs, but in general with a cleaner, simpler,
+higher performance design.
+
+\subsection{Performance overview}
+
+The core of the architecture is a very high performance and very low latency b+
+tree, which also is not a conventional b+ tree but more of a hybrid, taking
+concepts from compacting data structures: btree nodes are very large, log
+structured, and compacted (resorted) as necessary in memory. This means our b+
+trees are very shallow compared to other filesystems.
+
+What this means for the end user is that since we require very few seeks or disk
+reads, filesystem latency is extremely good - especially cache cold filesystem
+latency, which does not show up in most benchmarks but has a huge impact on real
+world performance, as well as how fast the system ``feels'' in normal interactive
+usage. Latency has been a major focus throughout the codebase - notably, we have
+assertions that we never hold b+ tree locks while doing IO, and the btree
+transaction layer makes it easy to aggressively drop and retake locks as
+needed - one major goal of bcachefs is to be the first general purpose soft
+realtime filesystem.
+
+Additionally, unlike other COW btrees, btree updates are journalled. This
+greatly improves our write efficiency on random update workloads, as it means
+btree writes are only done when we have a large block of updates, or when
+required by memory reclaim or journal reclaim.
+
+\subsection{Bucket based allocation}
+
+As mentioned bcachefs is descended from bcache, where the ability to efficiently
+invalidate cached data and reuse disk space was a core design requirement. To
+make this possible the allocator divides the disk up into buckets, typically
+512k to 2M but possibly larger or smaller. Buckets and data pointers have
+generation numbers: we can reuse a bucket with cached data in it without finding
+and deleting all the data pointers by incrementing the generation number.
+
+In keeping with the copy-on-write theme of avoiding update in place wherever
+possible, we never rewrite or overwrite data within a bucket - when we allocate
+a bucket, we write to it sequentially and then we don't write to it again until
+the bucket has been invalidated and the generation number incremented.
+
+This means we require a copying garbage collector to deal with internal
+fragmentation, when patterns of random writes leave us with many buckets that
+are partially empty (because the data they contained was overwritten) - copy GC
+evacuates buckets that are mostly empty by writing the data they contain to new
+buckets. This also means that we need to reserve space on the device for the
+copy GC reserve when formatting - typically 8\% or 12\%.
+
+There are some advantages to structuring the allocator this way, besides being
+able to support cached data:
+\begin{itemize}
+	\item By maintaining multiple write points that are writing to different buckets,
+		we're able to easily and naturally segregate unrelated IO from different
+		processes, which helps greatly with fragmentation.
+
+	\item The fast path of the allocator is essentially a simple bump allocator - the
+		disk space allocation is extremely fast
+
+	\item Fragmentation is generally a non issue unless copygc has to kick
+		in, and it usually doesn't under typical usage patterns. The
+		allocator and copygc are doing essentially the same things as
+		the flash translation layer in SSDs, but within the filesystem
+		we have much greater visibility into where writes are coming
+		from and how to segregate them, as well as which data is
+		actually live - performance is generally more predictable than
+		with SSDs under similar usage patterns.
+
+	\item The same algorithms will in the future be used for managing SMR
+		hard drives directly, avoiding the translation layer in the hard
+		drive - doing this work within the filesystem should give much
+		better performance and much more predictable latency.
+\end{itemize}
+
+\section{Feature overview}
+
+\subsection{IO path options}
+
+Most options that control the IO path can be set at either the filesystem level
+or on individual inodes (files and directories). When set on a directory via the
+\texttt{bcachefs attr} command, they will be automatically applied recursively.
+
+\subsubsection{Checksumming}
+
+bcachefs supports both metadata and data checksumming - crc32c by default, but
+stronger checksums are available as well. Enabling data checksumming incurs some
+performance overhead - besides the checksum calculation, writes have to be
+bounced for checksum stability (Linux generally cannot guarantee that the buffer
+being written is not modified in flight), but reads generally do not have to be
+bounced.
+
+Checksum granularity in bcachefs is at the level of individual extents, which
+results in smaller metadata but means we have to read entire extents in order to
+verify the checksum. By default, checksummed and compressed extents are capped
+at 64k. For most applications and usage scenarios this is an ideal trade off, but
+small random \texttt{O\_DIRECT} reads will incur significant overhead. In the
+future, checksum granularity will be a per-inode option.
+
+\subsubsection{Encryption}
+
+bcachefs supports authenticated (AEAD style) encryption - ChaCha20/Poly1305.
+When encryption is enabled, the poly1305 MAC replaces the normal data and
+metadata checksums. This style of encryption is superior to typical block layer
+or filesystem level encryption (usually AES-XTS), which only operates on blocks
+and doesn't have a way to store nonces or MACs. In contrast, we store a nonce
+and cryptographic MAC alongside data pointers - meaning we have a chain of trust
+up to the superblock (or journal, in the case of unclean shutdowns) and can
+definitely tell if metadata has been modified, dropped, or replaced with an
+earlier version - replay attacks are not possible.
+
+Encryption can only be specified for the entire filesystem, not per file or
+directory - this is because metadata blocks do not belong to a particular file.
+All metadata except for the superblock is encrypted.
+
+In the future we'll probably add AES-GCM for platforms that have hardware
+acceleration for AES, but in the meantime software implementations of ChaCha20
+are also quite fast on most platforms.
+
+\texttt{scrypt} is used for the key derivation function - for converting the
+user supplied passphrase to an encryption key.
+
+To format a filesystem with encryption, use
+\begin{quote} \begin{verbatim}
+bcachefs format --encrypted /dev/sda1
+\end{verbatim} \end{quote}
+
+You will be prompted for a passphrase. Then, to use an encrypted filesystem
+use the command
+\begin{quote} \begin{verbatim}
+bcachefs unlock /dev/sda1
+\end{verbatim} \end{quote}
+
+You will be prompted for the passphrase and the encryption key will be added to
+your in-kernel keyring; mount, fsck and other commands will then work as usual.
+
+The passphrase on an existing encrypted filesystem can be changed with the
+\texttt{bcachefs set-passphrase} command. To permanently unlock an encrypted
+filesystem, use the \texttt{bcachefs remove-passphrase} command - this can be
+useful when dumping filesystem metadata for debugging by the developers.
+
+There is a \texttt{wide\_macs} option which controls the size of the
+cryptographic MACs stored on disk. By default, only 80 bits are stored, which
+should be sufficient security for most applications. With the
+\texttt{wide\_macs} option enabled we store the full 128 bit MAC, at the cost of
+making extents 8 bytes bigger.
+
+\subsubsection{Compression}
+
+bcachefs supports gzip, lz4 and zstd compression. As with data checksumming, we
+compress entire extents, not individual disk blocks - this gives us better
+compression ratios than other filesystems, at the cost of reduced small random
+read performance.
+
+Data can also be compressed or recompressed with a different algorithm in the
+background by the rebalance thread, if the \texttt{background\_compression}
+option is set.
+
+\subsection{Multiple devices}
+
+bcachefs is a multi-device filesystem. Devices need not be the same size: by
+default, the allocator will stripe across all available devices but biasing in
+favor of the devices with more free space, so that all devices in the filesystem
+fill up at the same rate. Devices need not have the same performance
+characteristics: we track device IO latency and direct reads to the device that
+is currently fastest.
+
+\subsubsection{Replication}
+
+bcachefs supports standard RAID1/10 style redundancy with the
+\texttt{data\_replicas} and \texttt{metadata\_replicas} options. Layout is not
+fixed as with RAID10: a given extent can be replicated across any set of
+devices; the \texttt{bcachefs fs usage} command shows how data is replicated
+within a filesystem.
+
+\subsubsection{Erasure coding}
+
+bcachefs also supports Reed-Solomon erasure coding (the same algorithm used by
+most RAID5/6 implementations). When enabled with the \texttt{ec} option, the
+desired redundancy is taken from the \texttt{data\_replicas} option - erasure
+coding of metadata is not supported.
+
+Erasure coding works significantly differently from both conventional RAID
+implementations and other filesystems with similar features. In conventional
+RAID, the ``write hole'' is a significant problem - doing a small write within a
+stripe requires the P and Q (recovery) blocks to be updated as well, and since
+those writes cannot be done atomically there is a window where the P and Q
+blocks are inconsistent - meaning that if the system crashes and recovers with a
+drive missing, reconstruct reads for unrelated data within that stripe will be
+corrupted.
+
+ZFS avoids this by fragmenting individual writes so that every write becomes a
+new stripe - this works, but the fragmentation has a negative effect on
+performance: metadata becomes bigger, and both read and write requests are
+excessively fragmented. Btrfs's erasure coding implementation is more
+conventional, and still subject to the write hole problem.
+
+bcachefs's erasure coding takes advantage of our copy on write nature - since
+updating stripes in place is a problem, we simply don't do that. And since
+excessively small stripes are a problem for fragmentation, we don't erasure code
+individual extents, we erasure code entire buckets - taking advantage of bucket
+based allocation and copying garbage collection.
+
+When erasure coding is enabled, writes are initially replicated, but one of the
+replicas is allocated from a bucket that is queued up to be part of a new
+stripe. When we finish filling up the new stripe, we write out the P and Q
+buckets and then drop the extra replicas for all the data within that stripe -
+the effect is similar to full data journalling, and it means that after erasure
+coding is done the layout of our data on disk is ideal.
+
+Since disks have write caches that are only flushed when we issue a cache flush
+command - which we only do on journal commit - if we can tweak the allocator so
+that the buckets used for the extra replicas are reused (and then overwritten
+again) immediately, this full data journalling should have negligible overhead -
+this optimization is not implemented yet, however.
+
+\subsubsection{Device labels and targets}
+
+By default, writes are striped across all devices in a filesystem, but they may
+be directed to a specific device or set of devices with the various target
+options. The allocator only prefers to allocate from devices matching the
+specified target; if those devices are full, it will fall back to allocating
+from any device in the filesystem.
+
+Target options may refer to a device directly, e.g.
+\texttt{foreground\_target=/dev/sda1}, or they may refer to a device label. A
+device label is a path delimited by periods - e.g. ssd.ssd1 (and labels need not
+be unique). This gives us ways of referring to multiple devices in target
+options: If we specify ssd in a target option, that will refer to all devices
+with the label ssd or labels that start with ssd. (e.g. ssd.ssd1, ssd.ssd2).
+
+Four target options exist. These options all may be set at the filesystem level
+(at format time, at mount time, or at runtime via sysfs), or on a particular
+file or directory:
+
+\begin{description}
+	\item \texttt{foreground\_target}: normal foreground data writes, and
+		metadata if \\ \texttt{metadata\_target} is not set
+	\item \texttt{metadata\_target}: btree writes
+	\item \texttt{background\_target}: If set, user data (not metadata) will
+		be moved to this target in the background
+	\item\texttt{promote\_target}: If set, a cached copy will be added to
+		this target on read, if none exists
+\end{description}
+
+\subsubsection{Caching}
+
+When an extent has multiple copies on different devices, some of those copies
+may be marked as cached. Buckets containing only cached data are discarded as
+needed by the allocator in LRU order.
+
+When data is moved from one device to another according to the \\
+\texttt{background\_target} option, the original copy is left in place but
+marked as cached. With the \texttt{promote\_target} option, the original copy is
+left unchanged and the new copy on the \texttt{promote\_target} device is marked
+as cached.
+
+To do writeback caching, set \texttt{foreground\_target} and
+\texttt{promote\_target} to the cache device, and \texttt{background\_target} to
+the backing device. To do writearound caching, set \texttt{foreground\_target}
+to the backing device and \texttt{promote\_target} to the cache device.
+
+\subsubsection{Durability}
+
+Some devices may be considered to be more reliable than others. For example, we
+might have a filesystem composed of a hardware RAID array and several NVME flash
+devices, to be used as cache. We can set replicas=2 so that losing any of the
+NVME flash devices will not cause us to lose data, and then additionally we can
+set durability=2 for the hardware RAID device to tell bcachefs that we don't
+need extra replicas for data on that device - data on that device will count as
+two replicas, not just one.
+
+The durability option can also be used for writethrough caching: by setting
+durability=0 for a device, it can be used as a cache and only as a cache -
+bcachefs won't consider copies on that device to count towards the number of
+replicas we're supposed to keep.
+
+\subsection{Reflink}
+
+bcachefs supports reflink, similarly to other filesystems with the same feature.
+cp --reflink will create a copy that shares the underlying storage. Reading from
+that file will become slightly slower - the extent pointing to that data is
+moved to the reflink btree (with a refcount added) and in the extents btree we
+leave a key that points to the indirect extent in the reflink btree, meaning
+that we now have to do two btree lookups to read from that data instead of just
+one.
+
+\subsection{Inline data extents}
+
+bcachefs supports inline data extents, controlled by the \texttt{inline\_data}
+option (on by default). When the end of a file is being written and is smaller
+than half of the filesystem blocksize, it will be written as an inline data
+extent. Inline data extents can also be reflinked (moved to the reflink btree
+with a refcount added): as a todo item we also intend to support compressed
+inline data extents.
+
+\subsection{Subvolumes and snapshots}
+
+bcachefs supports subvolumes and snapshots with a similar userspace interface as
+btrfs. A new subvolume may be created empty, or it may be created as a snapshot
+of another subvolume. Snapshots are writeable and may be snapshotted again,
+creating a tree of snapshots.
+
+Snapshots are very cheap to create: they're not based on cloning of COW btrees
+as with btrfs, but instead are based on versioning of individual keys in the
+btrees. Many thousands or millions of snapshots can be created, with the only
+limitation being disk space.
+
+The following subcommands exist for managing subvolumes and snapshots:
+\begin{itemize}
+	\item \texttt{bcachefs subvolume create}: Create a new, empty subvolume
+	\item \texttt{bcachefs subvolume destroy}: Delete an existing subvolume
+		or snapshot
+	\item \texttt{bcachefs subvolume snapshot}: Create a snapshot of an
+		existing subvolume
+\end{itemize}
+
+A subvolume can also be deleted with a normal rmdir after deleting all the
+contents, as with \texttt{rm -rf}. Still to be implemented: read-only snapshots,
+recursive snapshot creation, and a method for recursively listing subvolumes.
+
+\subsection{Quotas}
+
+bcachefs supports conventional user/group/project quotas. Quotas do not
+currently apply to snapshot subvolumes, because if a file changes ownership in
+the snapshot it would be ambiguous as to what quota data within that file
+should be charged to.
+
+When a directory has a project ID set it is inherited automatically by
+descendants on creation and rename. When renaming a directory would cause the
+project ID to change we return -EXDEV so that the move is done file by file, so
+that the project ID is propagated correctly to descendants - thus, project
+quotas can be used as subdirectory quotas.
+
+\section{Management}
+
+\subsection{Formatting}
+
+To format a new bcachefs filesystem use the subcommand \texttt{bcachefs
+format}, or \texttt{mkfs.bcachefs}. All persistent filesystem-wide options can
+be specified at format time. For an example of a multi device filesystem with
+compression, encryption, replication and writeback caching:
+\begin{quote} \begin{verbatim}
+bcachefs format --compression=lz4               \
+                --encrypted                     \
+                --replicas=2                    \
+                --label=ssd.ssd1 /dev/sda       \
+                --label=ssd.ssd2 /dev/sdb       \
+                --label=hdd.hdd1 /dev/sdc       \
+                --label=hdd.hdd2 /dev/sdd       \
+                --label=hdd.hdd3 /dev/sde       \
+                --label=hdd.hdd4 /dev/sdf       \
+                --foreground_target=ssd	        \
+                --promote_target=ssd            \
+                --background_target=hdd
+\end{verbatim} \end{quote}
+
+\subsection{Mounting}
+
+To mount a multi device filesystem, there are two options. You can specify all
+component devices, separated by colons, e.g.
+\begin{quote} \begin{verbatim}
+mount -t bcachefs /dev/sda:/dev/sdb:/dev/sdc /mnt
+\end{verbatim} \end{quote}
+Or, use the mount.bcachefs tool to mount by filesystem UUID. Still todo: improve
+the mount.bcachefs tool to support mounting by filesystem label.
+
+No special handling is needed for recovering from unclean shutdown. Journal
+replay happens automatically, and diagnostic messages in the dmesg log will
+indicate whether recovery was from clean or unclean shutdown.
+
+The \texttt{-o degraded} option will allow a filesystem to be mounted without
+all the devices, but will fail if data would be missing. The
+\texttt{-o very\_degraded} option can be used to attempt mounting when data would be
+missing.
+
+Also relevant is the \texttt{-o nochanges} option. It disallows any and all
+writes to the underlying devices, pinning dirty data in memory as necessary if
+for example journal replay was necessary - think of it as a ``super read-only''
+mode. It can be used for data recovery, and for testing version upgrades.
+
+The \texttt{-o verbose} option enables additional log output during the mount process.
+
+\subsection{Fsck}
+
+It is possible to run fsck either in userspace with the \texttt{bcachefs fsck}
+subcommand (also available as \texttt{fsck.bcachefs}), or in the kernel while
+mounting by specifying the \texttt{-o fsck} mount option. In either case the
+exact same fsck implementation is being run, only the environment is different.
+Running fsck in the kernel at mount time has the advantage of somewhat better
+performance, while running in userspace has the ability to be stopped with
+ctrl-c and can prompt the user for fixing errors. To fix errors while running
+fsck in the kernel, use the \texttt{-o fix\_errors} option.
+
+The \texttt{-n} option passed to fsck implies the \texttt{-o nochanges} option;
+\texttt{bcachefs fsck -ny} can be used to test filesystem repair in dry-run
+mode.
+
+\subsection{Status of data}
+
+The \texttt{bcachefs fs usage} command may be used to display filesystem usage broken
+out in various ways. Data usage is broken out by type: superblock, journal,
+btree, data, cached data, and parity, and by which sets of devices extents are
+replicated across. We also give per-device usage which includes fragmentation
+due to partially used buckets.
+
+\subsection{Journal}
+
+The journal has a number of tunables that affect filesystem performance. Journal
+commits are fairly expensive operations as they require issuing FLUSH and FUA
+operations to the underlying devices. By default, we issue a journal flush one
+second after a filesystem update has been done; this is controlled with the
+\texttt{journal\_flush\_delay} option, which takes a parameter in milliseconds.
+
+Filesystem sync and fsync operations issue journal flushes; this can be disabled
+with the \texttt{journal\_flush\_disabled} option - the
+\texttt{journal\_flush\_delay} option will still apply, and in the event of a
+system crash we will never lose more than (by default) one second of work. This
+option may be useful on a personal workstation or laptop, and perhaps less
+appropriate on a server.
+
+The journal reclaim thread runs in the background, kicking off btree node writes
+and btree key cache flushes to free up space in the journal. Even in the absence
+of space pressure it will run slowly in the background: this is controlled by
+the \texttt{journal\_reclaim\_delay} parameter, with a default of 100
+milliseconds.
+
+The journal should be sized sufficiently that bursts of activity do not fill up
+the journal too quickly; also, a larger journal means that we can queue up larger
+btree writes. The \texttt{bcachefs device resize-journal} command can be used for
+resizing the journal on disk on a particular device - it can be used on a
+mounted or unmounted filesystem.
+
+In the future, we should implement a method to see how much space is currently
+utilized in the journal.
+
+\subsection{Device management}
+
+\subsubsection{Filesystem resize}
+
+A filesystem can be resized on a particular device with the
+\texttt{bcachefs device resize} subcommand. Currently only growing is supported,
+not shrinking.
+
+\subsubsection{Device add/removal}
+
+The following subcommands exist for adding and removing devices from a mounted
+filesystem:
+\begin{itemize}
+	\item \texttt{bcachefs device add}: Formats and adds a new device to an
+		existing filesystem.
+	\item \texttt{bcachefs device remove}: Permanently removes a device from
+		an existing filesystem.
+	\item \texttt{bcachefs device online}: Connects a device to a running
+		filesystem that was mounted without it (i.e. in degraded mode)
+	\item \texttt{bcachefs device offline}: Disconnects a device from a
+		mounted filesystem without removing it.
+	\item \texttt{bcachefs device evacuate}: Migrates data off of a
+		particular device to prepare for removal, setting it read-only
+		if necessary.
+	\item \texttt{bcachefs device set-state}: Changes the state of a member
+		device: one of rw (readwrite), ro (readonly), failed, or spare.
+
+		A failed device is considered to have 0 durability, and replicas
+		on that device won't be counted towards the number of replicas
+		an extent should have by rereplicate - however, bcachefs will
+		still attempt to read from devices marked as failed.
+\end{itemize}
+
+The \texttt{bcachefs device remove}, \texttt{bcachefs device offline} and
+\texttt{bcachefs device set-state} commands take force options for when they
+would leave the filesystem degraded or with data missing. Todo: regularize and
+improve those options.
+
+\subsection{Data management}
+
+\subsubsection{Data rereplicate}
+
+The \texttt{bcachefs data rereplicate} command may be used to scan for extents
+that have insufficient replicas and write additional replicas, e.g. after a
+device has been removed from a filesystem or after replication has been enabled
+or increased.
+
+\subsubsection{Rebalance}
+
+To be implemented: a command for moving data between devices to equalize usage
+on each device. Not normally required because the allocator attempts to equalize
+usage across devices as it stripes, but can be necessary in certain scenarios -
+e.g. when a two-device filesystem with replication enabled that is very full has
+a third device added.
+
+\subsubsection{Scrub}
+
+To be implemented: a command for reading all data within a filesystem and
+ensuring that checksums are valid, fixing bitrot when a valid copy can be found.
+
+\section{Options}
+
+Most bcachefs options can be set filesystem wide, and a significant subset can
+also be set on inodes (files and directories), overriding the global defaults.
+Filesystem wide options may be set when formatting, when mounting, or at runtime
+via \texttt{/sys/fs/bcachefs/<uuid>/options/}. When set at runtime via sysfs the
+persistent options in the superblock are updated as well; when options are
+passed as mount parameters the persistent options are unmodified.
+
+\subsection{File and directory options}
+
+Options set on inodes (files and directories) are automatically inherited by
+their descendants, and inodes also record whether a given option was explicitly
+set or inherited from their parent. When renaming a directory would cause
+inherited attributes to change we fail the rename with -EXDEV, causing userspace
+to do the rename file by file so that inherited attributes stay consistent.
+
+Inode options are available as extended attributes. The options that have been
+explicitly set are available under the \texttt{bcachefs} namespace, and the effective
+options (explicitly set and inherited options) are available under the
+\texttt{bcachefs\_effective} namespace. Examples of listing options with the
+getfattr command:
+
+\begin{quote} \begin{verbatim}
+$ getfattr -d -m '^bcachefs\.' filename
+$ getfattr -d -m '^bcachefs_effective\.' filename
+\end{verbatim} \end{quote}
+
+Options may be set via the extended attribute interface, but it is preferable to
+use the \texttt{bcachefs setattr} command as it will correctly propagate options
+recursively.
+
+\subsection{Full option list}
+
+\begin{tabbing}
+\hspace{0.2in} \= \kill
+	\texttt{block\_size}		\` \textbf{format}			\\
+	\> \parbox{4.3in}{Filesystem block size (default 4k)}			\\ \\
+
+	\texttt{btree\_node\_size}	\` \textbf{format}			\\
+	\> Btree node size, default 256k					\\ \\
+
+	\texttt{errors}			\` \textbf{format,mount,runtime}		\\
+	\> Action to take on filesystem error					\\ \\
+
+	\texttt{metadata\_replicas}	\` \textbf{format,mount,runtime}	\\
+	\> Number of replicas for metadata (journal and btree)			\\ \\
+
+	\texttt{data\_replicas}		\` \textbf{format,mount,runtime,inode}	\\
+	\> Number of replicas for user data					\\ \\
+
+	\texttt{replicas}		\` \textbf{format}			\\
+	\> Alias for both metadata\_replicas and data\_replicas			\\ \\
+
+	\texttt{metadata\_checksum}	\` \textbf{format,mount,runtime}	\\
+	\> Checksum type for metadata writes					\\ \\
+
+	\texttt{data\_checksum}		\` \textbf{format,mount,runtime,inode}	\\
+	\> Checksum type for data writes					\\ \\
+
+	\texttt{compression}		\` \textbf{format,mount,runtime,inode}	\\
+	\> Compression type							\\ \\
+
+	\texttt{background\_compression} \` \textbf{format,mount,runtime,inode}	\\
+	\> Background compression type						\\ \\
+
+	\texttt{str\_hash}		\` \textbf{format,mount,runtime,inode}	\\
+	\> Hash function for string hash tables (directories and xattrs)	\\ \\
+
+	\texttt{metadata\_target}	\` \textbf{format,mount,runtime,inode}	\\
+	\> Preferred target for metadata writes					\\ \\
+
+	\texttt{foreground\_target}	\` \textbf{format,mount,runtime,inode}	\\
+	\> Preferred target for foreground writes				\\ \\
+
+	\texttt{background\_target}	\` \textbf{format,mount,runtime,inode}	\\
+	\> Target for data to be moved to in the background			\\ \\
+
+	\texttt{promote\_target}	\` \textbf{format,mount,runtime,inode}	\\
+	\> Target for data to be copied to on read				\\ \\
+
+	\texttt{erasure\_code}		\` \textbf{format,mount,runtime,inode}	\\
+	\> Enable erasure coding						\\ \\
+
+	\texttt{inodes\_32bit}		\` \textbf{format,mount,runtime}	\\
+	\> Restrict new inode numbers to 32 bits				\\ \\
+
+	\texttt{shard\_inode\_numbers}	\` \textbf{format,mount,runtime}	\\
+	\> Use CPU id for high bits of new inode numbers. 			\\ \\
+
+	\texttt{wide\_macs}		\` \textbf{format,mount,runtime}	\\
+	\> Store full 128 bit cryptographic MACs (default 80)			\\ \\
+
+	\texttt{inline\_data}		\` \textbf{format,mount,runtime}	\\
+	\> Enable inline data extents (default on)				\\ \\
+
+	\texttt{journal\_flush\_delay}	\` \textbf{format,mount,runtime}	\\
+	\> Delay in milliseconds before automatic journal commit (default 1000)	\\ \\
+
+	\texttt{journal\_flush\_disabled}\`\textbf{format,mount,runtime}	\\
+	\> \begin{minipage}{4.3in}Disables journal flush on sync/fsync.
+		\texttt{journal\_flush\_delay}	remains in effect, thus with the
+		default setting not more than 1 second of work will be lost.
+	\end{minipage}								\\ \\
+
+	\texttt{journal\_reclaim\_delay}\` \textbf{format,mount,runtime}	\\
+	\> Delay in milliseconds before automatic journal reclaim		\\ \\
+
+	\texttt{acl}			\` \textbf{format,mount}		\\
+	\> Enable POSIX ACLs							\\ \\
+
+	\texttt{usrquota}		\` \textbf{format,mount}		\\
+	\> Enable user quotas							\\ \\
+
+	\texttt{grpquota}		\` \textbf{format,mount}		\\
+	\> Enable group quotas							\\ \\
+
+	\texttt{prjquota}		\` \textbf{format,mount}		\\
+	\> Enable project quotas						\\ \\
+
+	\texttt{degraded}		\` \textbf{mount}			\\
+	\> Allow mounting with data degraded					\\ \\
+
+	\texttt{very\_degraded}		\` \textbf{mount}			\\
+	\> Allow mounting with data missing					\\ \\
+
+	\texttt{verbose}		\` \textbf{mount}			\\
+	\> Extra debugging info during mount/recovery				\\ \\
+
+	\texttt{fsck}			\` \textbf{mount}			\\
+	\> Run fsck during mount						\\ \\
+
+	\texttt{fix\_errors}		\` \textbf{mount}			\\
+	\> Fix errors without asking during fsck				\\ \\
+
+	\texttt{ratelimit\_errors}	\` \textbf{mount}			\\
+	\> Ratelimit error messages during fsck					\\ \\
+
+	\texttt{read\_only}		\` \textbf{mount}			\\
+	\> Mount in read only mode						\\ \\
+
+	\texttt{nochanges}		\` \textbf{mount}			\\
+	\> Issue no writes, even for journal replay				\\ \\
+
+	\texttt{norecovery}		\` \textbf{mount}			\\
+	\> Don't replay the journal (not recommended)				\\ \\
+
+	\texttt{noexcl}			\` \textbf{mount}			\\
+	\> Don't open devices in exclusive mode					\\ \\
+
+	\texttt{version\_upgrade}	\` \textbf{mount}			\\
+	\> Upgrade on disk format to latest version				\\ \\
+
+	\texttt{discard}		\` \textbf{device}			\\
+	\> Enable discard/TRIM support						\\ \\
+\end{tabbing}
+
+\subsection{Error actions}
+The \texttt{errors} option is used for inconsistencies that indicate some sort
+of a bug. Valid error actions are:
+\begin{description}
+	\item[{\tt continue}] Log the error but continue normal operation
+	\item[{\tt ro}] Emergency read only, immediately halting any changes
+		to the filesystem on disk
+	\item[{\tt panic}] Immediately halt the entire machine, printing a
+		backtrace on the system console
+\end{description}
+
+\subsection{Checksum types}
+Valid checksum types are:
+\begin{description}
+	\item[{\tt none}]
+	\item[{\tt crc32c}] (default)
+	\item[{\tt crc64}]
+\end{description}
+
+\subsection{Compression types}
+Valid compression types are:
+\begin{description}
+	\item[{\tt none}] (default)
+	\item[{\tt lz4}]
+	\item[{\tt gzip}]
+	\item[{\tt zstd}]
+\end{description}
+
+\subsection{String hash types}
+Valid hash types for string hash tables are:
+\begin{description}
+	\item[{\tt crc32c}]
+	\item[{\tt crc64}]
+	\item[{\tt siphash}] (default)
+\end{description}
+
+\section{Debugging tools}
+
+\subsection{Sysfs interface}
+
+Mounted filesystems are available in sysfs at \texttt{/sys/fs/bcachefs/<uuid>/}
+with various options, performance counters and internal debugging aids.
+
+\subsubsection{Options}
+
+Filesystem options may be viewed and changed via \\
+\texttt{/sys/fs/bcachefs/<uuid>/options/}, and settings changed via sysfs will
+be persistently changed in the superblock as well.
+
+\subsubsection{Time stats}
+
+bcachefs tracks the latency and frequency of various operations and events, with
+quantiles for latency/duration in the
+\texttt{/sys/fs/bcachefs/<uuid>/time\_stats/} directory.
+
+\begin{description}
+	\item \texttt{blocked\_allocate} \\
+		Tracks when allocating a bucket must wait because none are
+		immediately available, meaning the copygc thread is not keeping
+		up with evacuating mostly empty buckets or the allocator thread
+		is not keeping up with invalidating and discarding buckets.
+
+	\item \texttt{blocked\_allocate\_open\_bucket} \\
+		Tracks when allocating a bucket must wait because all of our
+		handles for pinning open buckets are in use (we statically
+		allocate 1024).
+
+	\item \texttt{blocked\_journal} \\
+		Tracks when getting a journal reservation must wait, either
+		because journal reclaim isn't keeping up with reclaiming space
+		in the journal, or because journal writes are taking too long to
+		complete and we already have too many in flight.
+
+	\item \texttt{btree\_gc} \\
+		Tracks when the btree\_gc code must walk the btree at runtime -
+		for recalculating the oldest outstanding generation number of
+		every bucket in the btree.
+
+	\item \texttt{btree\_lock\_contended\_read}
+	\item \texttt{btree\_lock\_contended\_intent}
+	\item \texttt{btree\_lock\_contended\_write} \\
+		Track when taking a read, intent or write lock on a btree node
+		must block.
+
+	\item \texttt{btree\_node\_mem\_alloc} \\
+		Tracks the total time to allocate memory in the btree node cache
+		for a new btree node.
+
+	\item \texttt{btree\_node\_split} \\
+		Tracks btree node splits - when a btree node becomes full and is
+		split into two new nodes.
+
+	\item \texttt{btree\_node\_compact} \\
+		Tracks btree node compactions - when a btree node becomes full
+		and needs to be compacted on disk.
+
+	\item \texttt{btree\_node\_merge} \\
+		Tracks when two adjacent btree nodes are merged.
+
+	\item \texttt{btree\_node\_sort} \\
+		Tracks sorting and resorting entire btree nodes in memory,
+		either after reading them in from disk or for compacting prior
+		to creating a new sorted array of keys.
+
+	\item \texttt{btree\_node\_read} \\
+		Tracks reading in btree nodes from disk.
+
+	\item \texttt{btree\_interior\_update\_foreground} \\
+		Tracks foreground time for btree updates that change btree
+		topology - i.e. btree node splits, compactions and merges; the
+		duration measured roughly corresponds to lock held time.
+
+	\item \texttt{btree\_interior\_update\_total} \\
+		Tracks time to completion for topology changing btree updates;
+		first they have a foreground part that updates btree nodes in
+		memory, then after the new nodes are written there is a
+		transaction phase that records an update to an interior node or
+		a new btree root as well as changes to the alloc btree.
+
+	\item \texttt{data\_read} \\
+		Tracks the core read path - looking up a request in the extents
+		(and possibly also reflink) btree, allocating bounce buffers if
+		necessary, issuing reads, checksumming, decompressing, decrypting,
+		and delivering completions.
+
+	\item \texttt{data\_write} \\
+		Tracks the core write path - allocating space on disk for a new
+		write, allocating bounce buffers if necessary,
+		compressing, encrypting, checksumming, issuing writes, and
+		updating the extents btree to point to the new data.
+
+	\item \texttt{data\_promote} \\
+		Tracks promote operations, which happen when a read operation
+		writes an additional cached copy of an extent to
+		\texttt{promote\_target}. This is done asynchronously from the
+		original read.
+
+	\item \texttt{journal\_flush\_write} \\
+		Tracks writing of flush journal entries to disk, which first
+		issue cache flush operations to the underlying devices then
+		issue the journal writes as FUA writes. Time is tracked starting
+		from after all journal reservations have released their
+		references or the completion of the previous journal write.
+
+	\item \texttt{journal\_noflush\_write} \\
+		Tracks writing of non-flush journal entries to disk, which do
+		not issue cache flushes or FUA writes.
+
+	\item \texttt{journal\_flush\_seq} \\
+		Tracks time to flush a journal sequence number to disk by
+		filesystem sync and fsync operations, as well as the allocator
+		prior to reusing buckets when none that do not need flushing are
+		available.
+\end{description}
+
+\subsubsection{Internals}
+
+\begin{description}
+	\item \texttt{btree\_cache} \\
+		Shows information on the btree node cache: number of cached
+		nodes, number of dirty nodes, and whether the cannibalize lock
+		(for reclaiming cached nodes to allocate new nodes) is held.
+
+	\item \texttt{dirty\_btree\_nodes} \\
+		Prints information related to the interior btree node update
+		machinery, which is responsible for ensuring dependent btree
+		node writes are ordered correctly.
+
+		For each dirty btree node, prints:
+		\begin{itemize}
+			\item Whether the \texttt{need\_write} flag is set
+			\item The level of the btree node
+			\item The number of sectors written
+			\item Whether writing this node is blocked, waiting for
+				other nodes to be written
+			\item Whether it is waiting on a btree\_update to
+				complete and make it reachable on-disk
+		\end{itemize}
+
+	\item \texttt{btree\_key\_cache} \\
+		Prints information on the btree key cache: number of freed keys
+		(which must wait for a sRCU barrier to complete before being
+		freed), number of cached keys, and number of dirty keys.
+
+	\item \texttt{btree\_transactions} \\
+		Lists each running btree transaction that has locks held,
+		listing which nodes they have locked and what type of lock, what
+		node (if any) the process is blocked attempting to lock, and
+		where the btree transaction was invoked from.
+
+	\item \texttt{btree\_updates} \\
+		Lists outstanding interior btree updates: the mode (nothing
+		updated yet, or updated a btree node, or wrote a new btree root,
+		or was reparented by another btree update), whether its new
+		btree nodes have finished writing, its embedded closure's
+		refcount (while nonzero, the btree update is still waiting), and
+		the pinned journal sequence number.
+
+	\item \texttt{journal\_debug} \\
+		Prints a variety of internal journal state.
+
+	\item \texttt{journal\_pins} \\
+		Lists items pinning journal entries, preventing them from being
+		reclaimed.
+
+	\item \texttt{new\_stripes} \\
+		Lists new erasure-coded stripes being created.
+
+	\item \texttt{stripes\_heap} \\
+		Lists erasure-coded stripes that are available to be reused.
+
+	\item \texttt{open\_buckets} \\
+		Lists buckets currently being written to, along with data type
+		and refcount.
+
+	\item \texttt{io\_timers\_read}
+	\item \texttt{io\_timers\_write} \\
+		Lists outstanding IO timers - timers that wait on total reads or
+		writes to the filesystem.
+
+	\item \texttt{trigger\_journal\_flush} \\
+		Echoing to this file triggers a journal commit.
+
+	\item \texttt{trigger\_gc} \\
+		Echoing to this file causes the GC code to recalculate each
+		bucket's oldest\_gen field.
+
+	\item \texttt{prune\_cache} \\
+		Echoing to this file prunes the btree node cache.
+
+	\item \texttt{read\_realloc\_races} \\
+		This counts events where the read path reads an extent and
+		discovers the bucket that was read from has been reused while
+		the IO was in flight, causing the read to be retried.
+
+	\item \texttt{extent\_migrate\_done} \\
+		This counts extents moved by the core move path, used by copygc
+		and rebalance.
+
+	\item \texttt{extent\_migrate\_raced} \\
+		This counts extents that the move path attempted to move but no
+		longer existed when doing the final btree update.
+\end{description}
+
+\subsubsection{Unit and performance tests}
+
+Echoing into \texttt{/sys/fs/bcachefs/<uuid>/perf\_test} runs various low level
+btree tests, some intended as unit tests and others as performance tests. The
+syntax is
+\begin{quote} \begin{verbatim}
+	echo <test_name> <nr_iterations> <nr_threads> > perf_test
+\end{verbatim} \end{quote}
+
+When complete, the elapsed time will be printed in the dmesg log. The full list
+of tests that can be run can be found near the bottom of
+\texttt{fs/bcachefs/tests.c}.
+
+\subsection{Debugfs interface}
+
+The contents of every btree, as well as various internal per-btree-node
+information, are available under \texttt{/sys/kernel/debug/bcachefs/<uuid>/}.
+
+For every btree, we have the following files:
+
+\begin{description}
+	\item \textit{btree\_name} \\
+		Entire btree contents, one key per line
+
+	\item \textit{btree\_name}\texttt{-formats} \\
+		Information about each btree node: the size of the packed bkey
+		format, how full each btree node is, number of packed and
+		unpacked keys, and number of nodes and failed nodes in the
+		in-memory search trees.
+
+	\item \textit{btree\_name}\texttt{-bfloat-failed} \\
+		For each sorted set of keys in a btree node, we construct a
+		binary search tree in eytzinger layout with compressed keys.
+		Sometimes we aren't able to construct a correct compressed
+		search key, which results in slower lookups; this file lists the
+		keys that resulted in these failed nodes.
+\end{description}
+
+\subsection{Listing and dumping filesystem metadata}
+
+\subsubsection{bcachefs show-super}
+
+This subcommand is used for examining and printing bcachefs superblocks. It
+takes two optional parameters:
+\begin{description}
+	\item \texttt{-l}: Print superblock layout, which records the amount of
+		space reserved for the superblock and the locations of the
+		backup superblocks.
+	\item \texttt{-f, --fields=(fields)}: List of superblock sections to
+		print, \texttt{all} to print all sections.
+\end{description}
+
+\subsubsection{bcachefs list}
+
+This subcommand gives access to the same functionality as the debugfs interface,
+listing btree nodes and contents, but for offline filesystems.
+
+\subsubsection{bcachefs list\_journal}
+
+This subcommand lists the contents of the journal, which primarily records btree
+updates ordered by when they occurred.
+
+\subsubsection{bcachefs dump}
+
+This subcommand can dump all metadata in a filesystem (including multi device
+filesystems) as qcow2 images: when encountering issues that \texttt{fsck} can
+not recover from and need attention from the developers, this makes it possible
+to send the developers only the required metadata. Encrypted filesystems must
+first be unlocked with \texttt{bcachefs remove-passphrase}.
+
+\section{ioctl interface}
+
+This section documents bcachefs-specific ioctls:
+
+\begin{description}
+	\item \texttt{BCH\_IOCTL\_QUERY\_UUID} \\
+		Returns the UUID of the filesystem: used to find the sysfs
+		directory given a path to a mounted filesystem.
+
+	\item \texttt{BCH\_IOCTL\_FS\_USAGE} \\
+		Queries filesystem usage, returning global counters and a list
+		of counters by \texttt{bch\_replicas} entry.
+
+	\item \texttt{BCH\_IOCTL\_DEV\_USAGE} \\
+		Queries usage for a particular device, as bucket and sector
+		counts broken out by data type.
+
+	\item \texttt{BCH\_IOCTL\_READ\_SUPER} \\
+		Returns the filesystem superblock, and optionally the superblock
+		for a particular device given that device's index.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_ADD} \\
+		Given a path to a device, adds it to a mounted and running
+		filesystem. The device must already have a bcachefs superblock;
+		options and parameters are read from the new device's superblock
+		and added to the member info section of the existing
+		filesystem's superblock.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_REMOVE} \\
+		Given a path to a device or a device index, attempts to remove
+		it from a mounted and running filesystem. This operation
+		requires walking the btree to remove all references to this
+		device, and may fail if data would become degraded or lost,
+		unless appropriate force flags are set.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_ONLINE} \\
+		Given a path to a device that is a member of a running
+		filesystem (in degraded mode), brings it back online.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_OFFLINE} \\
+		Given a path or device index of a device in a multi device
+		filesystem, attempts to close it without removing it, so that
+		the device may be re-added later and the contents will still be
+		available.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_SET\_STATE} \\
+		Given a path or device index of a device in a multi device
+		filesystem, attempts to set its state to one of read-write,
+		read-only, failed or spare. Takes flags to force if the
+		filesystem would become degraded.
+
+	\item \texttt{BCH\_IOCTL\_DISK\_GET\_IDX} \\
+	\item \texttt{BCH\_IOCTL\_DISK\_RESIZE} \\
+	\item \texttt{BCH\_IOCTL\_DISK\_RESIZE\_JOURNAL} \\
+	\item \texttt{BCH\_IOCTL\_DATA} \\
+		Starts a data job, which walks all data and/or metadata in a
+		filesystem, performing some operation on each btree
+		node and extent. Returns a file descriptor which can be read
+		from to get the current status of the job, and closing the file
+		descriptor (i.e. on process exit) stops the data job.
+
+	\item \texttt{BCH\_IOCTL\_SUBVOLUME\_CREATE} \\
+	\item \texttt{BCH\_IOCTL\_SUBVOLUME\_DESTROY} \\
+	\item \texttt{BCHFS\_IOC\_REINHERIT\_ATTRS} \\
+\end{description}
+
+\section{On disk format}
+
+\subsection{Superblock}
+
+The superblock is the first thing to be read when accessing a bcachefs
+filesystem. It is located 4kb from the start of the device, with redundant
+copies elsewhere - typically one immediately after the first superblock, and one
+at the end of the device.
+
+The \texttt{bch\_sb\_layout} records the amount of space reserved for the
+superblock as well as the locations of all the superblocks. It is included with
+every superblock, and additionally written 3584 bytes from the start of the
+device (512 bytes before the first superblock).
+
+Most of the superblock is identical across each device. The exceptions are the
+\texttt{dev\_idx} field, and the journal section which gives the location of the
+journal.
+
+The main section of the superblock contains UUIDs, version numbers, number of
+devices within the filesystem and device index, block size, filesystem creation
+time, and various options and settings. The superblock also has a number of
+variable length sections:
+
+\begin{description}
+	\item \texttt{BCH\_SB\_FIELD\_journal} \\
+		List of buckets used for the journal on this device.
+
+	\item \texttt{BCH\_SB\_FIELD\_members} \\
+		List of member devices, as well as per-device options and
+		settings, including bucket size, number of buckets and time when
+		last mounted.
+
+	\item \texttt{BCH\_SB\_FIELD\_crypt} \\
+		Contains the main chacha20 encryption key, encrypted by the
+		user's passphrase, as well as key derivation function settings.
+
+	\item \texttt{BCH\_SB\_FIELD\_replicas} \\
+		Contains a list of replica entries, which are lists of devices
+		that have extents replicated across them. 
+
+	\item \texttt{BCH\_SB\_FIELD\_quota} \\
+		Contains timelimit and warnlimit fields for each quota type
+		(user, group and project) and counter (space, inodes).
+
+	\item \texttt{BCH\_SB\_FIELD\_disk\_groups} \\
+		Formerly referred to as disk groups (and still is throughout the
+		code); this section contains device label strings and records
+		the tree structure of label paths, allowing a label once parsed
+		to be referred to by integer ID by the target options.
+
+	\item \texttt{BCH\_SB\_FIELD\_clean} \\
+		When the filesystem is clean, this section contains a list of
+		journal entries that are normally written with each journal
+		write (\texttt{struct jset}): btree roots, as well as filesystem
+		usage and read/write counters (total amount of data read/written
+		to this filesystem). This allows reading the journal to be
+		skipped after clean shutdowns.
+\end{description}
+
+\subsection{Journal}
+
+Every journal write (\texttt{struct jset}) contains a list of entries:
+\texttt{struct jset\_entry}. Below are listed the various journal entry types.
+
+\begin{description}
+	\item \texttt{BCH\_JSET\_ENTRY\_btree\_key} \\
+		This entry type is used to record every btree update that
+		happens. It contains one or more btree keys (\texttt{struct
+		bkey}), and the \texttt{btree\_id} and \texttt{level} fields of
+		\texttt{jset\_entry} record the btree ID and level the key
+		belongs to.
+
+	\item \texttt{BCH\_JSET\_ENTRY\_btree\_root} \\
+		This entry type is used for pointers to btree roots. In the current
+		implementation, every journal write still records every btree
+		root, although that is subject to change. A btree root is a bkey
+		of type \texttt{KEY\_TYPE\_btree\_ptr\_v2}, and the btree\_id
+		and level fields of \texttt{jset\_entry} record the btree ID and
+		depth.
+
+	\item \texttt{BCH\_JSET\_ENTRY\_clock} \\
+		Records IO time, not wall clock time - i.e. the amount of reads
+		and writes, in 512 byte sectors since the filesystem was
+		created.
+
+	\item \texttt{BCH\_JSET\_ENTRY\_usage} \\
+		Used for certain persistent counters: number of inodes, current
+		maximum key version, and sectors of persistent reservations.
+
+	\item \texttt{BCH\_JSET\_ENTRY\_data\_usage} \\
+		Stores replica entries with a usage counter, in sectors.
+
+	\item \texttt{BCH\_JSET\_ENTRY\_dev\_usage} \\
+		Stores usage counters for each device: sectors used and buckets
+		used, broken out by each data type.
+\end{description}
+
+\subsection{Btrees}
+
+\subsection{Btree keys}
+
+\begin{description}
+	\item \texttt{KEY\_TYPE\_deleted}
+	\item \texttt{KEY\_TYPE\_whiteout}
+	\item \texttt{KEY\_TYPE\_error}
+	\item \texttt{KEY\_TYPE\_cookie}
+	\item \texttt{KEY\_TYPE\_hash\_whiteout}
+	\item \texttt{KEY\_TYPE\_btree\_ptr}
+	\item \texttt{KEY\_TYPE\_extent}
+	\item \texttt{KEY\_TYPE\_reservation}
+	\item \texttt{KEY\_TYPE\_inode}
+	\item \texttt{KEY\_TYPE\_inode\_generation}
+	\item \texttt{KEY\_TYPE\_dirent}
+	\item \texttt{KEY\_TYPE\_xattr}
+	\item \texttt{KEY\_TYPE\_alloc}
+	\item \texttt{KEY\_TYPE\_quota}
+	\item \texttt{KEY\_TYPE\_stripe}
+	\item \texttt{KEY\_TYPE\_reflink\_p}
+	\item \texttt{KEY\_TYPE\_reflink\_v}
+	\item \texttt{KEY\_TYPE\_inline\_data}
+	\item \texttt{KEY\_TYPE\_btree\_ptr\_v2}
+	\item \texttt{KEY\_TYPE\_indirect\_inline\_data}
+	\item \texttt{KEY\_TYPE\_alloc\_v2}
+	\item \texttt{KEY\_TYPE\_subvolume}
+	\item \texttt{KEY\_TYPE\_snapshot}
+	\item \texttt{KEY\_TYPE\_inode\_v2}
+	\item \texttt{KEY\_TYPE\_alloc\_v3}
+\end{description}
+
+\end{document}
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 537ab79..b1385a7 100644
--- a/libbcachefs/bkey_sort.c
+++ b/libbcachefs/bkey_sort.c
@@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 	return nr;
 }
 
-static void extent_sort_append(struct bch_fs *c,
-			       struct bkey_format *f,
-			       struct btree_nr_keys *nr,
-			       struct bkey_packed **out,
-			       struct bkey_s k)
-{
-	if (!bkey_deleted(k.k)) {
-		if (!bch2_bkey_pack_key(*out, k.k, f))
-			memcpy_u64s_small(*out, k.k, BKEY_U64s);
-
-		memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
-
-		btree_keys_account_key_add(nr, 0, *out);
-		*out = bkey_next(*out);
-	}
-}
-
 /* Sort + repack in a new format: */
 struct btree_nr_keys
 bch2_sort_repack(struct bset *dst, struct btree *src,
@@ -144,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
 	struct bkey_format *in_f = &src->format;
 	struct bkey_packed *in, *out = vstruct_last(dst);
 	struct btree_nr_keys nr;
+	bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
 
 	memset(&nr, 0, sizeof(nr));
 
@@ -151,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
 		if (filter_whiteouts && bkey_deleted(in))
 			continue;
 
-		if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-				       ? in_f : &bch2_bkey_format_current, in))
+		if (!transform)
+			bkey_copy(out, in);
+		else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+					     ? in_f : &bch2_bkey_format_current, in))
 			out->format = KEY_FORMAT_LOCAL_BTREE;
 		else
 			bch2_bkey_unpack(src, (void *) out, in);
@@ -165,47 +151,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
 	return nr;
 }
 
-/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *c,
-		       struct bset *dst, struct btree *src,
-		       struct btree_node_iter *iter,
-		       struct bkey_format *out_f,
-		       bool filter_whiteouts)
-{
-	struct bkey_packed *out = vstruct_last(dst), *k_packed;
-	struct bkey_buf k;
-	struct btree_nr_keys nr;
-
-	memset(&nr, 0, sizeof(nr));
-	bch2_bkey_buf_init(&k);
-
-	while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
-		if (filter_whiteouts && bkey_deleted(k_packed))
-			continue;
-
-		/*
-		 * NOTE:
-		 * bch2_bkey_normalize may modify the key we pass it (dropping
-		 * stale pointers) and we don't have a write lock on the src
-		 * node; we have to make a copy of the entire key before calling
-		 * normalize
-		 */
-		bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
-		bch2_bkey_unpack(src, k.k, k_packed);
-
-		if (filter_whiteouts &&
-		    bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
-			continue;
-
-		extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
-	}
-
-	dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-	bch2_bkey_buf_exit(&k, c);
-	return nr;
-}
-
 static inline int sort_keys_cmp(struct btree *b,
 				struct bkey_packed *l,
 				struct bkey_packed *r)
diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h
index 1059996..79cf11d 100644
--- a/libbcachefs/bkey_sort.h
+++ b/libbcachefs/bkey_sort.h
@@ -37,11 +37,6 @@ struct btree_nr_keys
 bch2_sort_repack(struct bset *, struct btree *,
 		 struct btree_node_iter *,
 		 struct bkey_format *, bool);
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *,
-		       struct bset *, struct btree *,
-		       struct btree_node_iter *,
-		       struct bkey_format *, bool);
 
 unsigned bch2_sort_keys(struct bkey_packed *,
 			struct sort_iter *, bool);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index c19c3ac..9b22c5e 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -391,16 +391,10 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
 	bch2_btree_node_iter_init_from_start(&src_iter, src);
 
-	if (btree_node_is_extents(src))
-		nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
-				src, &src_iter,
-				&dst->format,
-				true);
-	else
-		nr = bch2_sort_repack(btree_bset_first(dst),
-				src, &src_iter,
-				&dst->format,
-				true);
+	nr = bch2_sort_repack(btree_bset_first(dst),
+			src, &src_iter,
+			&dst->format,
+			true);
 
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
 			       start_time);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index fc9d5ba..bdbb900 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -155,11 +155,19 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
 	 * goes to 0, and it's safe because we have the node intent
 	 * locked:
 	 */
-	atomic64_sub(__SIX_VAL(read_lock, readers),
-		     &b->c.lock.state.counter);
+	if (!b->c.lock.readers)
+		atomic64_sub(__SIX_VAL(read_lock, readers),
+			     &b->c.lock.state.counter);
+	else
+		this_cpu_sub(*b->c.lock.readers, readers);
+
 	btree_node_lock_type(trans->c, b, SIX_LOCK_write);
-	atomic64_add(__SIX_VAL(read_lock, readers),
-		     &b->c.lock.state.counter);
+
+	if (!b->c.lock.readers)
+		atomic64_add(__SIX_VAL(read_lock, readers),
+			     &b->c.lock.state.counter);
+	else
+		this_cpu_add(*b->c.lock.readers, readers);
 }
 
 bool __bch2_btree_node_relock(struct btree_trans *trans,
@@ -369,19 +377,16 @@ bool __bch2_btree_node_lock(struct btree_trans *trans,
 	if (six_trylock_type(&b->c.lock, type))
 		return true;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 	trans->locking_path_idx = path->idx;
 	trans->locking_pos	= pos;
 	trans->locking_btree_id	= path->btree_id;
 	trans->locking_level	= level;
 	trans->locking		= b;
-#endif
 
 	ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 	trans->locking = NULL;
-#endif
+
 	if (ret)
 		bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
 				       start_time);
@@ -2796,12 +2801,10 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 
 	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 	trans->pid = current->pid;
 	mutex_lock(&c->btree_trans_lock);
 	list_add(&trans->list, &c->btree_trans_list);
 	mutex_unlock(&c->btree_trans_lock);
-#endif
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -2840,11 +2843,9 @@ void bch2_trans_exit(struct btree_trans *trans)
 
 	check_btree_paths_leaked(trans);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 	mutex_lock(&c->btree_trans_lock);
 	list_del(&trans->list);
 	mutex_unlock(&c->btree_trans_lock);
-#endif
 
 	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 
@@ -2888,7 +2889,6 @@ bch2_btree_path_node_to_text(struct printbuf *out,
 	bch2_bpos_to_text(out, btree_node_pos(_b, cached));
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 static bool trans_has_locks(struct btree_trans *trans)
 {
 	struct btree_path *path;
@@ -2898,11 +2898,9 @@ static bool trans_has_locks(struct btree_trans *trans)
 			return true;
 	return false;
 }
-#endif
 
 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
 	struct btree_trans *trans;
 	struct btree_path *path;
 	struct btree *b;
@@ -2956,7 +2954,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 		}
 	}
 	mutex_unlock(&c->btree_trans_lock);
-#endif
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 2c2e2f7..22dbbe3 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -366,7 +366,6 @@ struct btree_trans_commit_hook {
 
 struct btree_trans {
 	struct bch_fs		*c;
-#ifdef CONFIG_BCACHEFS_DEBUG
 	struct list_head	list;
 	struct btree		*locking;
 	unsigned		locking_path_idx;
@@ -374,7 +373,6 @@ struct btree_trans {
 	u8			locking_btree_id;
 	u8			locking_level;
 	pid_t			pid;
-#endif
 	unsigned long		ip;
 	int			srcu_idx;
 
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 295942e..95d1988 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -1271,22 +1271,24 @@ err:
  * When deleting, check if we need to emit a whiteout (because we're overwriting
  * something in an ancestor snapshot)
  */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-				      enum btree_id btree_id, struct bpos pos)
+static int need_whiteout_for_snapshot(struct btree_trans *trans, struct btree_iter *orig)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
-	u32 snapshot = pos.snapshot;
+	u32 snapshot = orig->pos.snapshot;
 	int ret;
 
-	if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+	if (!bch2_snapshot_parent(trans->c, snapshot))
 		return 0;
 
-	pos.snapshot++;
+	bch2_trans_copy_iter(&iter, orig);
+	iter.flags &= BTREE_ITER_FILTER_SNAPSHOTS;
+	iter.flags |= BTREE_ITER_ALL_SNAPSHOTS;
 
-	for_each_btree_key_norestart(trans, iter, btree_id, pos,
-			   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-		if (bkey_cmp(k.k->p, pos))
+	bch2_btree_iter_advance(&iter);
+
+	for_each_btree_key_continue_norestart(iter, 0, k, ret) {
+		if (bkey_cmp(k.k->p, orig->pos))
 			break;
 
 		if (bch2_snapshot_is_ancestor(trans->c, snapshot,
@@ -1312,6 +1314,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 
 	BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
 	BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+	BUG_ON(bpos_cmp(k->k.p, iter->pos));
 
 	n = (struct btree_insert_entry) {
 		.flags		= flags,
@@ -1332,7 +1335,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
 
 	if (bkey_deleted(&n.k->k) &&
 	    (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-		int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p);
+		int ret = need_whiteout_for_snapshot(trans, iter);
 		if (unlikely(ret < 0))
 			return ret;
 
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 4ad843f..9cdd03f 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -2705,7 +2705,8 @@ int bch2_truncate(struct user_namespace *mnt_userns,
 			U64_MAX, &i_sectors_delta);
 	i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-	BUG_ON(!inode->v.i_size && inode->v.i_blocks);
+	WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
+		!bch2_journal_error(&c->journal));
 
 	if (unlikely(ret))
 		goto err;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index d5d32bf..3f51eda 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -262,21 +262,6 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 	return ret;
 }
 
-static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
-
-	if (!fs_usage)
-		return -ENOMEM;
-
-	bch2_fs_usage_to_text(out, c, fs_usage);
-
-	percpu_up_read(&c->mark_lock);
-
-	kfree(fs_usage);
-	return 0;
-}
-
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
 	struct btree_trans trans;
@@ -386,9 +371,6 @@ SHOW(bch2_fs)
 
 	/* Debugging: */
 
-	if (attr == &sysfs_alloc_debug)
-		return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf;
-
 	if (attr == &sysfs_journal_debug) {
 		bch2_journal_debug_to_text(&out, &c->journal);
 		return out.pos - buf;
@@ -580,7 +562,6 @@ STORE(bch2_fs_internal)
 SYSFS_OPS(bch2_fs_internal);
 
 struct attribute *bch2_fs_internal_files[] = {
-	&sysfs_alloc_debug,
 	&sysfs_journal_debug,
 	&sysfs_journal_pins,
 	&sysfs_btree_updates,
@@ -588,17 +569,21 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_btree_cache,
 	&sysfs_btree_key_cache,
 	&sysfs_btree_transactions,
+	&sysfs_new_stripes,
 	&sysfs_stripes_heap,
 	&sysfs_open_buckets,
+	&sysfs_io_timers_read,
+	&sysfs_io_timers_write,
+
+	&sysfs_trigger_journal_flush,
+	&sysfs_trigger_gc,
+	&sysfs_prune_cache,
 
 	&sysfs_read_realloc_races,
 	&sysfs_extent_migrate_done,
 	&sysfs_extent_migrate_raced,
 
-	&sysfs_trigger_journal_flush,
-	&sysfs_trigger_gc,
 	&sysfs_gc_gens_pos,
-	&sysfs_prune_cache,
 
 	&sysfs_copy_gc_enabled,
 	&sysfs_copy_gc_wait,
@@ -607,11 +592,6 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
 
-	&sysfs_new_stripes,
-
-	&sysfs_io_timers_read,
-	&sysfs_io_timers_write,
-
 	&sysfs_data_op_data_progress,
 
 	&sysfs_internal_uuid,
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index dfd8c43..478c00a 100644
--- a/libbcachefs/tests.c
+++ b/libbcachefs/tests.c
@@ -14,12 +14,14 @@ static void delete_test_keys(struct bch_fs *c)
 	int ret;
 
 	ret = bch2_btree_delete_range(c, BTREE_ID_extents,
-				      POS(0, 0), POS(0, U64_MAX),
+				      SPOS(0, 0, U32_MAX),
+				      SPOS(0, U64_MAX, U32_MAX),
 				      NULL);
 	BUG_ON(ret);
 
 	ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-				      POS(0, 0), POS(0, U64_MAX),
+				      SPOS(0, 0, U32_MAX),
+				      SPOS(0, U64_MAX, U32_MAX),
 				      NULL);
 	BUG_ON(ret);
 }
@@ -541,10 +543,11 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
 	u64 i;
 
 	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
 
 	for (i = 0; i < nr; i++) {
-		bch2_btree_iter_set_pos(&iter, POS(0, test_rand()));
+		bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
 		k = bch2_btree_iter_peek(&iter);
 		ret = bkey_err(k);
@@ -567,7 +570,7 @@ static int rand_mixed_trans(struct btree_trans *trans,
 	struct bkey_s_c k;
 	int ret;
 
-	bch2_btree_iter_set_pos(iter, POS(0, pos));
+	bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
 
 	k = bch2_btree_iter_peek(iter);
 	ret = bkey_err(k);
@@ -594,7 +597,8 @@ static int rand_mixed(struct bch_fs *c, u64 nr)
 	u64 i, rand;
 
 	bch2_trans_init(&trans, c, 0, 0);
-	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+			     SPOS(0, 0, U32_MAX), 0);
 
 	for (i = 0; i < nr; i++) {
 		rand = test_rand();
@@ -673,7 +677,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+	for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
 			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
 		insert.k.p = iter.pos;
 
@@ -703,7 +707,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
+	for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+			   SPOS(0, 0, U32_MAX), 0, k, ret)
 		;
 	bch2_trans_iter_exit(&trans, &iter);
 
@@ -720,7 +725,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
 
 	bch2_trans_init(&trans, c, 0, 0);
 
-	for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+	for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+			   SPOS(0, 0, U32_MAX),
 			   BTREE_ITER_INTENT, k, ret) {
 		struct bkey_i_cookie u;
 
@@ -745,8 +751,7 @@ static int seq_delete(struct bch_fs *c, u64 nr)
 	int ret;
 
 	ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-				      POS(0, 0), POS(0, U64_MAX),
-				      NULL);
+				      SPOS(0, 0, U32_MAX), POS_MAX, NULL);
 	if (ret)
 		bch_err(c, "error in seq_delete: %i", ret);
 	return ret;