From: Kent Overstreet Date: Mon, 20 Dec 2021 00:37:29 +0000 (-0500) Subject: Update bcachefs sources to ff3a76e1af bcachefs: Change need_whiteout_for_snapshot... X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d06f5690fab526c4a4a8a4d55f9c4e675d883be9;p=bcachefs-tools-debian Update bcachefs sources to ff3a76e1af bcachefs: Change need_whiteout_for_snapshot() to clone iterator --- diff --git a/.bcachefs_revision b/.bcachefs_revision index d0a5221..4bae87b 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -635ca475f4f40ddcb2976f8f20a89df4c574aa22 +ff3a76e1af04f51506f45e0f71d53f7e6dd51a75 diff --git a/.gitignore b/.gitignore index 8feb598..b1c03cd 100644 --- a/.gitignore +++ b/.gitignore @@ -18,4 +18,4 @@ tests/__pycache__/ mount/target mount.bcachefs -doc/bcachefs.5.rst +bcachefs-principles-of-operation.* diff --git a/Makefile b/Makefile index e94419f..67c40e5 100644 --- a/Makefile +++ b/Makefile @@ -28,15 +28,6 @@ PYTEST_CMD?=$(shell \ ) PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS) -RST2MAN_ARGS?= -RST2MAN_CMD?=$(shell \ - command -v rst2man \ - || which rst2man \ - || command -v rst2man.py \ - || which rst2man.py \ -) -RST2MAN:=$(RST2MAN_CMD) $(RST2MAN_ARGS) - CARGO_ARGS= CARGO=cargo $(CARGO_ARGS) CARGO_PROFILE=release @@ -108,18 +99,6 @@ TAGS: tags: ctags -R . -DOCSRC := opts_macro.h bcachefs.5.rst.tmpl -DOCGENERATED := bcachefs.5 doc/bcachefs.5.rst -DOCDEPS := $(addprefix ./doc/,$(DOCSRC)) -bcachefs.5: $(DOCDEPS) libbcachefs/opts.h -ifneq (,$(RST2MAN_CMD)) - $(CC) doc/opts_macro.h -I libbcachefs -I include -E 2>/dev/null \ - | doc/macro2rst.py - $(RST2MAN) doc/bcachefs.5.rst bcachefs.5 -else - @echo "WARNING: no rst2man found! Man page not generated." -endif - SRCS=$(shell find . 
-type f -iname '*.c') DEPS=$(SRCS:.c=.d) -include $(DEPS) @@ -184,6 +163,11 @@ clean: deb: all debuild -us -uc -nc -b -i -I +bcachefs-principles-of-operation.pdf: bcachefs-principles-of-operation.tex + pdflatex bcachefs-principles-of-operation.tex && pdflatex bcachefs-principles-of-operation.tex + +doc: bcachefs-principles-of-operation.pdf + .PHONY: update-bcachefs-sources update-bcachefs-sources: git rm -rf --ignore-unmatch libbcachefs diff --git a/bcachefs-principles-of-operation.tex b/bcachefs-principles-of-operation.tex new file mode 100644 index 0000000..d5ac6ed --- /dev/null +++ b/bcachefs-principles-of-operation.tex @@ -0,0 +1,1188 @@ +\documentclass{article} + +\usepackage{imakeidx} +\usepackage[pdfborder={0 0 0}]{hyperref} +\usepackage{longtable} + +\title{bcachefs: Principles of Operation} +\author{Kent Overstreet} + +\date{} + +\begin{document} + +\maketitle +\tableofcontents + +\section{Introduction and overview} + +Bcachefs is a modern, general purpose, copy on write filesystem descended from +bcache, a block layer cache. + +The internal architecture is very different from most existing filesystems where +the inode is central and many data structures hang off of the inode. Instead, +bcachefs is architected more like a filesystem on top of a relational database, +with tables for the different filesystem data types - extents, inodes, dirents, +xattrs, et cetera. + +bcachefs supports almost all of the same features as other modern COW +filesystems, such as ZFS and btrfs, but in general with a cleaner, simpler, +higher performance design. + +\subsection{Performance overview} + +The core of the architecture is a very high performance and very low latency b+ +tree, which also is not a conventional b+ tree but more of hybrid, taking +concepts from compacting data structures: btree nodes are very large, log +structured, and compacted (resorted) as necessary in memory. This means our b+ +trees are very shallow compared to other filesystems. 
+ +What this means for the end user is that since we require very few seeks or disk +reads, filesystem latency is extremely good - especially cache cold filesystem +latency, which does not show up in most benchmarks but has a huge impact on real +world performance, as well as how fast the system "feels" in normal interactive +usage. Latency has been a major focus throughout the codebase - notably, we have +assertions that we never hold b+ tree locks while doing IO, and the btree +transaction layer makes it easy to aggressively drop and retake locks as +needed - one major goal of bcachefs is to be the first general purpose soft +realtime filesystem. + +Additionally, unlike other COW btrees, btree updates are journalled. This +greatly improves our write efficiency on random update workloads, as it means +btree writes are only done when we have a large block of updates, or when +required by memory reclaim or journal reclaim. + +\subsection{Bucket based allocation} + +As mentioned bcachefs is descended from bcache, where the ability to efficiently +invalidate cached data and reuse disk space was a core design requirement. To +make this possible the allocator divides the disk up into buckets, typically +512k to 2M but possibly larger or smaller. Buckets and data pointers have +generation numbers: we can reuse a bucket with cached data in it without finding +and deleting all the data pointers by incrementing the generation number. + +In keeping with the copy-on-write theme of avoiding update in place wherever +possible, we never rewrite or overwrite data within a bucket - when we allocate +a bucket, we write to it sequentially and then we don't write to it again until +the bucket has been invalidated and the generation number incremented. 
+ +This means we require a copying garbage collector to deal with internal +fragmentation, when patterns of random writes leave us with many buckets that +are partially empty (because the data they contained was overwritten) - copy GC +evacuates buckets that are mostly empty by writing the data they contain to new +buckets. This also means that we need to reserve space on the device for the +copy GC reserve when formatting - typically 8\% or 12\%. + +There are some advantages to structuring the allocator this way, besides being +able to support cached data: +\begin{itemize} + \item By maintaining multiple write points that are writing to different buckets, + we're able to easily and naturally segregate unrelated IO from different + processes, which helps greatly with fragmentation. + + \item The fast path of the allocator is essentially a simple bump allocator - the + disk space allocation is extremely fast + + \item Fragmentation is generally a non issue unless copygc has to kick + in, and it usually doesn't under typical usage patterns. The + allocator and copygc are doing essentially the same things as + the flash translation layer in SSDs, but within the filesystem + we have much greater visibility into where writes are coming + from and how to segregate them, as well as which data is + actually live - performance is generally more predictable than + with SSDs under similar usage patterns. + + \item The same algorithms will in the future be used for managing SMR + hard drives directly, avoiding the translation layer in the hard + drive - doing this work within the filesystem should give much + better performance and much more predictable latency. +\end{itemize} + +\section{Feature overview} + +\subsection{IO path options} + +Most options that control the IO path can be set at either the filesystem level +or on individual inodes (files and directories). 
When set on a directory via the +\texttt{bcachefs attr} command, they will be automatically applied recursively. + +\subsubsection{Checksumming} + +bcachefs supports both metadata and data checksumming - crc32c by default, but +stronger checksums are available as well. Enabling data checksumming incurs some +performance overhead - besides the checksum calculation, writes have to be +bounced for checksum stability (Linux generally cannot guarantee that the buffer +being written is not modified in flight), but reads generally do not have to be +bounced. + +Checksum granularity in bcachefs is at the level of individual extents, which +results in smaller metadata but means we have to read entire extents in order to +verify the checksum. By default, checksummed and compressed extents are capped +at 64k. For most applications and usage scenarios this is an ideal trade off, but +small random \texttt{O\_DIRECT} reads will incur significant overhead. In the +future, checksum granularity will be a per-inode option. + +\subsubsection{Encryption} + +bcachefs supports authenticated (AEAD style) encryption - ChaCha20/Poly1305. +When encryption is enabled, the poly1305 MAC replaces the normal data and +metadata checksums. This style of encryption is superior to typical block layer +or filesystem level encryption (usually AES-XTS), which only operates on blocks +and doesn't have a way to store nonces or MACs. In contrast, we store a nonce +and cryptographic MAC alongside data pointers - meaning we have a chain of trust +up to the superblock (or journal, in the case of unclean shutdowns) and can +definitely tell if metadata has been modified, dropped, or replaced with an +earlier version - replay attacks are not possible. + +Encryption can only be specified for the entire filesystem, not per file or +directory - this is because metadata blocks do not belong to a particular file. +All metadata except for the superblock is encrypted. 
+ +In the future we'll probably add AES-GCM for platforms that have hardware +acceleration for AES, but in the meantime software implementations of ChaCha20 +are also quite fast on most platforms. + +\texttt{scrypt} is used for the key derivation function - for converting the +user supplied passphrase to an encryption key. + +To format a filesystem with encryption, use +\begin{quote} \begin{verbatim} +bcachefs format --encrypted /dev/sda1 +\end{verbatim} \end{quote} + +You will be prompted for a passphrase. Then, to use an encrypted filesystem +use the command +\begin{quote} \begin{verbatim} +bcachefs unlock /dev/sda1 +\end{verbatim} \end{quote} + +You will be prompted for the passphrase and the encryption key will be added to +your in-kernel keyring; mount, fsck and other commands will then work as usual. + +The passphrase on an existing encrypted filesystem can be changed with the +\texttt{bcachefs set-passphrase} command. To permanently unlock an encrypted +filesystem, use the \texttt{bcachefs remove-passphrase} command - this can be +useful when dumping filesystem metadata for debugging by the developers. + +There is a \texttt{wide\_macs} option which controls the size of the +cryptographic MACs stored on disk. By default, only 80 bits are stored, which +should be sufficient security for most applications. With the +\texttt{wide\_macs} option enabled we store the full 128 bit MAC, at the cost of +making extents 8 bytes bigger. + +\subsubsection{Compression} + +bcachefs supports gzip, lz4 and zstd compression. As with data checksumming, we +compress entire extents, not individual disk blocks - this gives us better +compression ratios than other filesystems, at the cost of reduced small random +read performance. + +Data can also be compressed or recompressed with a different algorithm in the +background by the rebalance thread, if the \texttt{background\_compression} +option is set. + +\subsection{Multiple devices} + +bcachefs is a multi-device filesystem. 
Devices need not be the same size: by +default, the allocator will stripe across all available devices but biasing in +favor of the devices with more free space, so that all devices in the filesystem +fill up at the same rate. Devices need not have the same performance +characteristics: we track device IO latency and direct reads to the device that +is currently fastest. + +\subsubsection{Replication} + +bcachefs supports standard RAID1/10 style redundancy with the +\texttt{data\_replicas} and \texttt{metadata\_replicas} options. Layout is not +fixed as with RAID10: a given extent can be replicated across any set of +devices; the \texttt{bcachefs fs usage} command shows how data is replicated +within a filesystem. + +\subsubsection{Erasure coding} + +bcachefs also supports Reed-Solomon erasure coding (the same algorithm used by +most RAID5/6 implementations). When enabled with the \texttt{ec} option, the +desired redundancy is taken from the \texttt{data\_replicas} option - erasure +coding of metadata is not supported. + +Erasure coding works significantly differently from both conventional RAID +implementations and other filesystems with similar features. In conventional +RAID, the "write hole" is a significant problem - doing a small write within a +stripe requires the P and Q (recovery) blocks to be updated as well, and since +those writes cannot be done atomically there is a window where the P and Q +blocks are inconsistent - meaning that if the system crashes and recovers with a +drive missing, reconstruct reads for unrelated data within that stripe will be +corrupted. + +ZFS avoids this by fragmenting individual writes so that every write becomes a +new stripe - this works, but the fragmentation has a negative effect on +performance: metadata becomes bigger, and both read and write requests are +excessively fragmented. Btrfs's erasure coding implementation is more +conventional, and still subject to the write hole problem. 
+ +bcachefs's erasure coding takes advantage of our copy on write nature - since +updating stripes in place is a problem, we simply don't do that. And since +excessively small stripes is a problem for fragmentation, we don't erasure code +individual extents, we erasure code entire buckets - taking advantage of bucket +based allocation and copying garbage collection. + +When erasure coding is enabled, writes are initially replicated, but one of the +replicas is allocated from a bucket that is queued up to be part of a new +stripe. When we finish filling up the new stripe, we write out the P and Q +buckets and then drop the extra replicas for all the data within that stripe - +the effect is similar to full data journalling, and it means that after erasure +coding is done the layout of our data on disk is ideal. + +Since disks have write caches that are only flushed when we issue a cache flush +command - which we only do on journal commit - if we can tweak the allocator so +that the buckets used for the extra replicas are reused (and then overwritten +again) immediately, this full data journalling should have negligible overhead - +this optimization is not implemented yet, however. + +\subsubsection{Device labels and targets} + +By default, writes are striped across all devices in a filesystem, but they may +be directed to a specific device or set of devices with the various target +options. The allocator only prefers to allocate from devices matching the +specified target; if those devices are full, it will fall back to allocating +from any device in the filesystem. + +Target options may refer to a device directly, e.g. +\texttt{foreground\_target=/dev/sda1}, or they may refer to a device label. A +device label is a path delimited by periods - e.g. ssd.ssd1 (and labels need not +be unique). 
This gives us ways of referring to multiple devices in target +options: If we specify ssd in a target option, that will refer to all devices +with the label ssd or labels that start with ssd. (e.g. ssd.ssd1, ssd.ssd2). + +Four target options exist. These options all may be set at the filesystem level +(at format time, at mount time, or at runtime via sysfs), or on a particular +file or directory: + +\begin{description} + \item \texttt{foreground\_target}: normal foreground data writes, and + metadata if \\ \texttt{metadata\_target} is not set + \item \texttt{metadata\_target}: btree writes + \item \texttt{background\_target}: If set, user data (not metadata) will + be moved to this target in the background + \item\texttt{promote\_target}: If set, a cached copy will be added to + this target on read, if none exists +\end{description} + +\subsubsection{Caching} + +When an extent has multiple copies on different devices, some of those copies +may be marked as cached. Buckets containing only cached data are discarded as +needed by the allocator in LRU order. + +When data is moved from one device to another according to the \\ +\texttt{background\_target} option, the original copy is left in place but +marked as cached. With the \texttt{promote\_target} option, the original copy is +left unchanged and the new copy on the \texttt{promote\_target} device is marked +as cached. + +To do writeback caching, set \texttt{foreground\_target} and +\texttt{promote\_target} to the cache device, and \texttt{background\_target} to +the backing device. To do writearound caching, set \texttt{foreground\_target} +to the backing device and \texttt{promote\_target} to the cache device. + +\subsubsection{Durability} + +Some devices may be considered to be more reliable than others. For example, we +might have a filesystem composed of a hardware RAID array and several NVME flash +devices, to be used as cache. 
We can set replicas=2 so that losing any of the +NVME flash devices will not cause us to lose data, and then additionally we can +set durability=2 for the hardware RAID device to tell bcachefs that we don't +need extra replicas for data on that device - data on that device will count as +two replicas, not just one. + +The durability option can also be used for writethrough caching: by setting +durability=0 for a device, it can be used as a cache and only as a cache - +bcachefs won't consider copies on that device to count towards the number of +replicas we're supposed to keep. + +\subsection{Reflink} + +bcachefs supports reflink, similarly to other filesystems with the same feature. +cp --reflink will create a copy that shares the underlying storage. Reading from +that file will become slightly slower - the extent pointing to that data is +moved to the reflink btree (with a refcount added) and in the extents btree we +leave a key that points to the indirect extent in the reflink btree, meaning +that we now have to do two btree lookups to read from that data instead of just +one. + +\subsection{Inline data extents} + +bcachefs supports inline data extents, controlled by the \texttt{inline\_data} +option (on by default). When the end of a file is being written and is smaller +than half of the filesystem blocksize, it will be written as an inline data +extent. Inline data extents can also be reflinked (moved to the reflink btree +with a refcount added): as a todo item we also intend to support compressed +inline data extents. + +\subsection{Subvolumes and snapshots} + +bcachefs supports subvolumes and snapshots with a similar userspace interface as +btrfs. A new subvolume may be created empty, or it may be created as a snapshot +of another subvolume. Snapshots are writeable and may be snapshotted again, +creating a tree of snapshots. 
+ +Snapshots are very cheap to create: they're not based on cloning of COW btrees +as with btrfs, but instead are based on versioning of individual keys in the +btrees. Many thousands or millions of snapshots can be created, with the only +limitation being disk space. + +The following subcommands exist for managing subvolumes and snapshots: +\begin{itemize} + \item \texttt{bcachefs subvolume create}: Create a new, empty subvolume + \item \texttt{bcachefs subvolume destroy}: Delete an existing subvolume + or snapshot + \item \texttt{bcachefs subvolume snapshot}: Create a snapshot of an + existing subvolume +\end{itemize} + +A subvolume can also be deleted with a normal rmdir after deleting all the +contents, as with \texttt{rm -rf}. Still to be implemented: read-only snapshots, +recursive snapshot creation, and a method for recursively listing subvolumes. + +\subsection{Quotas} + +bcachefs supports conventional user/group/project quotas. Quotas do not +currently apply to snapshot subvolumes, because if a file changes ownership in +the snapshot it would be ambiguous as to what quota data within that file +should be charged to. + +When a directory has a project ID set it is inherited automatically by +descendants on creation and rename. When renaming a directory would cause the +project ID to change we return -EXDEV so that the move is done file by file, so +that the project ID is propagated correctly to descendants - thus, project +quotas can be used as subdirectory quotas. + +\section{Management} + +\subsection{Formatting} + +To format a new bcachefs filesystem use the subcommand \texttt{bcachefs +format}, or \texttt{mkfs.bcachefs}. All persistent filesystem-wide options can +be specified at format time. 
For an example of a multi device filesystem with +compression, encryption, replication and writeback caching: +\begin{quote} \begin{verbatim} +bcachefs format --compression=lz4 \ + --encrypted \ + --replicas=2 \ + --label=ssd.ssd1 /dev/sda \ + --label=ssd.ssd2 /dev/sdb \ + --label=hdd.hdd1 /dev/sdc \ + --label=hdd.hdd2 /dev/sdd \ + --label=hdd.hdd3 /dev/sde \ + --label=hdd.hdd4 /dev/sdf \ + --foreground_target=ssd \ + --promote_target=ssd \ + --background_target=hdd +\end{verbatim} \end{quote} + +\subsection{Mounting} + +To mount a multi device filesystem, there are two options. You can specify all +component devices, separated by colons, e.g. +\begin{quote} \begin{verbatim} +mount -t bcachefs /dev/sda:/dev/sdb:/dev/sdc /mnt +\end{verbatim} \end{quote} +Or, use the mount.bcachefs tool to mount by filesystem UUID. Still todo: improve +the mount.bcachefs tool to support mounting by filesystem label. + +No special handling is needed for recovering from unclean shutdown. Journal +replay happens automatically, and diagnostic messages in the dmesg log will +indicate whether recovery was from clean or unclean shutdown. + +The \texttt{-o degraded} option will allow a filesystem to be mounted without +all the devices, but will fail if data would be missing. The +\texttt{-o very\_degraded} can be used to attempt mounting when data would be +missing. + +Also relevant is the \texttt{-o nochanges} option. It disallows any and all +writes to the underlying devices, pinning dirty data in memory as necessary if +for example journal replay was necessary - think of it as a "super read-only" +mode. It can be used for data recovery, and for testing version upgrades. + +The \texttt{-o verbose} enables additional log output during the mount process. 
+ +\subsection{Fsck} + +It is possible to run fsck either in userspace with the \texttt{bcachefs fsck} +subcommand (also available as \texttt{fsck.bcachefs}), or in the kernel while +mounting by specifying the \texttt{-o fsck} mount option. In either case the +exact same fsck implementation is being run, only the environment is different. +Running fsck in the kernel at mount time has the advantage of somewhat better +performance, while running in userspace has the ability to be stopped with +ctrl-c and can prompt the user for fixing errors. To fix errors while running +fsck in the kernel, use the \texttt{-o fix\_errors} option. + +The \texttt{-n} option passed to fsck implies the \texttt{-o nochanges} option; +\texttt{bcachefs fsck -ny} can be used to test filesystem repair in dry-run +mode. + +\subsection{Status of data} + +The \texttt{bcachefs fs usage} may be used to display filesystem usage broken +out in various ways. Data usage is broken out by type: superblock, journal, +btree, data, cached data, and parity, and by which sets of devices extents are +replicated across. We also give per-device usage which includes fragmentation +due to partially used buckets. + +\subsection{Journal} + +The journal has a number of tunables that affect filesystem performance. Journal +commits are fairly expensive operations as they require issuing FLUSH and FUA +operations to the underlying devices. By default, we issue a journal flush one +second after a filesystem update has been done; this is controlled with the +\texttt{journal\_flush\_delay} option, which takes a parameter in milliseconds. + +Filesystem sync and fsync operations issue journal flushes; this can be disabled +with the \texttt{journal\_flush\_disabled} option - the +\texttt{journal\_flush\_delay} option will still apply, and in the event of a +system crash we will never lose more than (by default) one second of work. 
This +option may be useful on a personal workstation or laptop, and perhaps less +appropriate on a server. + +The journal reclaim thread runs in the background, kicking off btree node writes +and btree key cache flushes to free up space in the journal. Even in the absence +of space pressure it will run slowly in the background: this is controlled by +the \texttt{journal\_reclaim\_delay} parameter, with a default of 100 +milliseconds. + +The journal should be sized sufficiently that bursts of activity do not fill up +the journal too quickly; also, a larger journal means that we can queue up larger +btree writes. The \texttt{bcachefs device resize-journal} can be used for +resizing the journal on disk on a particular device - it can be used on a +mounted or unmounted filesystem. + +In the future, we should implement a method to see how much space is currently +utilized in the journal. + +\subsection{Device management} + +\subsubsection{Filesystem resize} + +A filesystem can be resized on a particular device with the +\texttt{bcachefs device resize} subcommand. Currently only growing is supported, +not shrinking. + +\subsubsection{Device add/removal} + +The following subcommands exist for adding and removing devices from a mounted +filesystem: +\begin{itemize} + \item \texttt{bcachefs device add}: Formats and adds a new device to an + existing filesystem. + \item \texttt{bcachefs device remove}: Permanently removes a device from + an existing filesystem. + \item \texttt{bcachefs device online}: Connects a device to a running + filesystem that was mounted without it (i.e. in degraded mode) + \item \texttt{bcachefs device offline}: Disconnects a device from a + mounted filesystem without removing it. + \item \texttt{bcachefs device evacuate}: Migrates data off of a + particular device to prepare for removal, setting it read-only + if necessary. 
+ \item \texttt{bcachefs device set-state}: Changes the state of a member + device: one of rw (readwrite), ro (readonly), failed, or spare. + + A failed device is considered to have 0 durability, and replicas + on that device won't be counted towards the number of replicas + an extent should have by rereplicate - however, bcachefs will + still attempt to read from devices marked as failed. +\end{itemize} + +The \texttt{bcachefs device remove}, \texttt{bcachefs device offline} and +\texttt{bcachefs device set-state} commands take force options for when they +would leave the filesystem degraded or with data missing. Todo: regularize and +improve those options. + +\subsection{Data management} + +\subsubsection{Data rereplicate} + +The \texttt{bcachefs data rereplicate} command may be used to scan for extents +that have insufficient replicas and write additional replicas, e.g. after a +device has been removed from a filesystem or after replication has been enabled +or increased. + +\subsubsection{Rebalance} + +To be implemented: a command for moving data between devices to equalize usage +on each device. Not normally required because the allocator attempts to equalize +usage across devices as it stripes, but can be necessary in certain scenarios - +i.e. when a two-device filesystem with replication enabled that is very full has +a third device added. + +\subsubsection{Scrub} + +To be implemented: a command for reading all data within a filesystem and +ensuring that checksums are valid, fixing bitrot when a valid copy can be found. + +\section{Options} + +Most bcachefs options can be set filesystem wide, and a significant subset can +also be set on inodes (files and directories), overriding the global defaults. +Filesystem wide options may be set when formatting, when mounting, or at runtime +via \texttt{/sys/fs/bcachefs//options/}. 
When set at runtime via sysfs the +persistent options in the superblock are updated as well; when options are +passed as mount parameters the persistent options are unmodified. + +\subsection{File and directory options} + +Options set on inodes (files and directories) are automatically inherited by +their descendants, and inodes also record whether a given option was explicitly +set or inherited from their parent. When renaming a directory would cause +inherited attributes to change we fail the rename with -EXDEV, causing userspace +to do the rename file by file so that inherited attributes stay consistent. + +Inode options are available as extended attributes. The options that have been +explicitly set are available under the \texttt{bcachefs} namespace, and the effective +options (explicitly set and inherited options) are available under the +\texttt{bcachefs\_effective} namespace. Examples of listing options with the +getfattr command: + +\begin{quote} \begin{verbatim} +$ getfattr -d -m '^bcachefs\.' filename +$ getfattr -d -m '^bcachefs_effective\.' filename +\end{verbatim} \end{quote} + +Options may be set via the extended attribute interface, but it is preferable to +use the \texttt{bcachefs setattr} command as it will correctly propagate options +recursively. 
+ +\subsection{Full option list} + +\begin{tabbing} +\hspace{0.2in} \= \kill + \texttt{block\_size} \` \textbf{format} \\ + \> \parbox{4.3in}{Filesystem block size (default 4k)} \\ \\ + + \texttt{btree\_node\_size} \` \textbf{format} \\ + \> Btree node size, default 256k \\ \\ + + \texttt{errors} \` \textbf{format,mount,runtime} \\ + \> Action to take on filesystem error \\ \\ + + \texttt{metadata\_replicas} \` \textbf{format,mount,runtime} \\ + \> Number of replicas for metadata (journal and btree) \\ \\ + + \texttt{data\_replicas} \` \textbf{format,mount,runtime,inode} \\ + \> Number of replicas for user data \\ \\ + + \texttt{replicas} \` \textbf{format} \\ + \> Alias for both metadata\_replicas and data\_replicas \\ \\ + + \texttt{metadata\_checksum} \` \textbf{format,mount,runtime} \\ + \> Checksum type for metadata writes \\ \\ + + \texttt{data\_checksum} \` \textbf{format,mount,runtime,inode} \\ + \> Checksum type for data writes \\ \\ + + \texttt{compression} \` \textbf{format,mount,runtime,inode} \\ + \> Compression type \\ \\ + + \texttt{background\_compression} \` \textbf{format,mount,runtime,inode} \\ + \> Background compression type \\ \\ + + \texttt{str\_hash} \` \textbf{format,mount,runtime,inode} \\ + \> Hash function for string hash tables (directories and xattrs) \\ \\ + + \texttt{metadata\_target} \` \textbf{format,mount,runtime,inode} \\ + \> Preferred target for metadata writes \\ \\ + + \texttt{foreground\_target} \` \textbf{format,mount,runtime,inode} \\ + \> Preferred target for foreground writes \\ \\ + + \texttt{background\_target} \` \textbf{format,mount,runtime,inode} \\ + \> Target for data to be moved to in the background \\ \\ + + \texttt{promote\_target} \` \textbf{format,mount,runtime,inode} \\ + \> Target for data to be copied to on read \\ \\ + + \texttt{erasure\_code} \` \textbf{format,mount,runtime,inode} \\ + \> Enable erasure coding \\ \\ + + \texttt{inodes\_32bit} \` \textbf{format,mount,runtime} \\ + \> Restrict new inode 
numbers to 32 bits \\ \\ + + \texttt{shard\_inode\_numbers} \` \textbf{format,mount,runtime} \\ + \> Use CPU id for high bits of new inode numbers. \\ \\ + + \texttt{wide\_macs} \` \textbf{format,mount,runtime} \\ + \> Store full 128 bit cryptographic MACs (default 80) \\ \\ + + \texttt{inline\_data} \` \textbf{format,mount,runtime} \\ + \> Enable inline data extents (default on) \\ \\ + + \texttt{journal\_flush\_delay} \` \textbf{format,mount,runtime} \\ + \> Delay in milliseconds before automatic journal commit (default 1000) \\ \\ + + \texttt{journal\_flush\_disabled}\`\textbf{format,mount,runtime} \\ + \> \begin{minipage}{4.3in}Disables journal flush on sync/fsync. + \texttt{journal\_flush\_delay} remains in effect, thus with the + default setting not more than 1 second of work will be lost. + \end{minipage} \\ \\ + + \texttt{journal\_reclaim\_delay}\` \textbf{format,mount,runtime} \\ + \> Delay in milliseconds before automatic journal reclaim \\ \\ + + \texttt{acl} \` \textbf{format,mount} \\ + \> Enable POSIX ACLs \\ \\ + + \texttt{usrquota} \` \textbf{format,mount} \\ + \> Enable user quotas \\ \\ + + \texttt{grpquota} \` \textbf{format,mount} \\ + \> Enable group quotas \\ \\ + + \texttt{prjquota} \` \textbf{format,mount} \\ + \> Enable project quotas \\ \\ + + \texttt{degraded} \` \textbf{mount} \\ + \> Allow mounting with data degraded \\ \\ + + \texttt{very\_degraded} \` \textbf{mount} \\ + \> Allow mounting with data missing \\ \\ + + \texttt{verbose} \` \textbf{mount} \\ + \> Extra debugging info during mount/recovery \\ \\ + + \texttt{fsck} \` \textbf{mount} \\ + \> Run fsck during mount \\ \\ + + \texttt{fix\_errors} \` \textbf{mount} \\ + \> Fix errors without asking during fsck \\ \\ + + \texttt{ratelimit\_errors} \` \textbf{mount} \\ + \> Ratelimit error messages during fsck \\ \\ + + \texttt{read\_only} \` \textbf{mount} \\ + \> Mount in read only mode \\ \\ + + \texttt{nochanges} \` \textbf{mount} \\ + \> Issue no writes, even for journal replay 
\\ \\ + + \texttt{norecovery} \` \textbf{mount} \\ + \> Don't replay the journal (not recommended) \\ \\ + + \texttt{noexcl} \` \textbf{mount} \\ + \> Don't open devices in exclusive mode \\ \\ + + \texttt{version\_upgrade} \` \textbf{mount} \\ + \> Upgrade on disk format to latest version \\ \\ + + \texttt{discard} \` \textbf{device} \\ + \> Enable discard/TRIM support \\ \\ +\end{tabbing} + +\subsection{Error actions} +The \texttt{errors} option is used for inconsistencies that indicate some sort +of a bug. Valid error actions are: +\begin{description} + \item[{\tt continue}] Log the error but continue normal operation + \item[{\tt ro}] Emergency read only, immediately halting any changes + to the filesystem on disk + \item[{\tt panic}] Immediately halt the entire machine, printing a + backtrace on the system console +\end{description} + +\subsection{Checksum types} +Valid checksum types are: +\begin{description} + \item[{\tt none}] + \item[{\tt crc32c}] (default) + \item[{\tt crc64}] +\end{description} + +\subsection{Compression types} +Valid compression types are: +\begin{description} + \item[{\tt none}] (default) + \item[{\tt lz4}] + \item[{\tt gzip}] + \item[{\tt zstd}] +\end{description} + +\subsection{String hash types} +Valid hash types for string hash tables are: +\begin{description} + \item[{\tt crc32c}] + \item[{\tt crc64}] + \item[{\tt siphash}] (default) +\end{description} + +\section{Debugging tools} + +\subsection{Sysfs interface} + +Mounted filesystems are available in sysfs at \texttt{/sys/fs/bcachefs/<uuid>/} +with various options, performance counters and internal debugging aids. + +\subsubsection{Options} + +Filesystem options may be viewed and changed via \\ +\texttt{/sys/fs/bcachefs/<uuid>/options/}, and settings changed via sysfs will +be persistently changed in the superblock as well.
+ +\subsubsection{Time stats} + +bcachefs tracks the latency and frequency of various operations and events, with +quantiles for latency/duration in the +\texttt{/sys/fs/bcachefs/<uuid>/time\_stats/} directory. + +\begin{description} + \item \texttt{blocked\_allocate} \\ + Tracks when allocating a bucket must wait because none are + immediately available, meaning the copygc thread is not keeping + up with evacuating mostly empty buckets or the allocator thread + is not keeping up with invalidating and discarding buckets. + + \item \texttt{blocked\_allocate\_open\_bucket} \\ + Tracks when allocating a bucket must wait because all of our + handles for pinning open buckets are in use (we statically + allocate 1024). + + \item \texttt{blocked\_journal} \\ + Tracks when getting a journal reservation must wait, either + because journal reclaim isn't keeping up with reclaiming space + in the journal, or because journal writes are taking too long to + complete and we already have too many in flight. + + \item \texttt{btree\_gc} \\ + Tracks when the btree\_gc code must walk the btree at runtime - + for recalculating the oldest outstanding generation number of + every bucket in the btree. + + \item \texttt{btree\_lock\_contended\_read} + \item \texttt{btree\_lock\_contended\_intent} + \item \texttt{btree\_lock\_contended\_write} \\ + Track when taking a read, intent or write lock on a btree node + must block. + + \item \texttt{btree\_node\_mem\_alloc} \\ + Tracks the total time to allocate memory in the btree node cache + for a new btree node. + + \item \texttt{btree\_node\_split} \\ + Tracks btree node splits - when a btree node becomes full and is + split into two new nodes. + + \item \texttt{btree\_node\_compact} \\ + Tracks btree node compactions - when a btree node becomes full + and needs to be compacted on disk. + + \item \texttt{btree\_node\_merge} \\ + Tracks when two adjacent btree nodes are merged.
+ + \item \texttt{btree\_node\_sort} \\ + Tracks sorting and resorting entire btree nodes in memory, + either after reading them in from disk or for compacting prior + to creating a new sorted array of keys. + + \item \texttt{btree\_node\_read} \\ + Tracks reading in btree nodes from disk. + + \item \texttt{btree\_interior\_update\_foreground} \\ + Tracks foreground time for btree updates that change btree + topology - i.e. btree node splits, compactions and merges; the + duration measured roughly corresponds to lock held time. + + \item \texttt{btree\_interior\_update\_total} \\ + Tracks time to completion for topology changing btree updates; + first they have a foreground part that updates btree nodes in + memory, then after the new nodes are written there is a + transaction phase that records an update to an interior node or + a new btree root as well as changes to the alloc btree. + + \item \texttt{data\_read} \\ + Tracks the core read path - looking up a request in the extents + (and possibly also reflink) btree, allocating bounce buffers if + necessary, issuing reads, checksumming, decompressing, decrypting, + and delivering completions. + + \item \texttt{data\_write} \\ + Tracks the core write path - allocating space on disk for a new + write, allocating bounce buffers if necessary, + compressing, encrypting, checksumming, issuing writes, and + updating the extents btree to point to the new data. + + \item \texttt{data\_promote} \\ + Tracks promote operations, which happen when a read operation + writes an additional cached copy of an extent to + \texttt{promote\_target}. This is done asynchronously from the + original read. + + \item \texttt{journal\_flush\_write} \\ + Tracks writing of flush journal entries to disk, which first + issue cache flush operations to the underlying devices then + issue the journal writes as FUA writes. 
Time is tracked starting + from after all journal reservations have released their + references or the completion of the previous journal write. + + \item \texttt{journal\_noflush\_write} \\ + Tracks writing of non-flush journal entries to disk, which do + not issue cache flushes or FUA writes. + + \item \texttt{journal\_flush\_seq} \\ + Tracks time to flush a journal sequence number to disk by + filesystem sync and fsync operations, as well as the allocator + prior to reusing buckets when none that do not need flushing are + available. +\end{description} + +\subsubsection{Internals} + +\begin{description} + \item \texttt{btree\_cache} \\ + Shows information on the btree node cache: number of cached + nodes, number of dirty nodes, and whether the cannibalize lock + (for reclaiming cached nodes to allocate new nodes) is held. + + \item \texttt{dirty\_btree\_nodes} \\ + Prints information related to the interior btree node update + machinery, which is responsible for ensuring dependent btree + node writes are ordered correctly. + + For each dirty btree node, prints: + \begin{itemize} + \item Whether the \texttt{need\_write} flag is set + \item The level of the btree node + \item The number of sectors written + \item Whether writing this node is blocked, waiting for + other nodes to be written + \item Whether it is waiting on a btree\_update to + complete and make it reachable on-disk + \end{itemize} + + \item \texttt{btree\_key\_cache} \\ + Prints information on the btree key cache: number of freed keys + (which must wait for a sRCU barrier to complete before being + freed), number of cached keys, and number of dirty keys. + + \item \texttt{btree\_transactions} \\ + Lists each running btree transaction that has locks held, + listing which nodes they have locked and what type of lock, what + node (if any) the process is blocked attempting to lock, and + where the btree transaction was invoked from.
+ + \item \texttt{btree\_updates} \\ + Lists outstanding interior btree updates: the mode (nothing + updated yet, or updated a btree node, or wrote a new btree root, + or was reparented by another btree update), whether its new + btree nodes have finished writing, its embedded closure's + refcount (while nonzero, the btree update is still waiting), and + the pinned journal sequence number. + + \item \texttt{journal\_debug} \\ + Prints a variety of internal journal state. + + \item \texttt{journal\_pins} \\ + Lists items pinning journal entries, preventing them from being + reclaimed. + + \item \texttt{new\_stripes} \\ + Lists new erasure-coded stripes being created. + + \item \texttt{stripes\_heap} \\ + Lists erasure-coded stripes that are available to be reused. + + \item \texttt{open\_buckets} \\ + Lists buckets currently being written to, along with data type + and refcount. + + \item \texttt{io\_timers\_read} \\ + \item \texttt{io\_timers\_write} \\ + Lists outstanding IO timers - timers that wait on total reads or + writes to the filesystem. + + \item \texttt{trigger\_journal\_flush} \\ + Echoing to this file triggers a journal commit. + + \item \texttt{trigger\_gc} \\ + Echoing to this file causes the GC code to recalculate each + bucket's oldest\_gen field. + + \item \texttt{prune\_cache} \\ + Echoing to this file prunes the btree node cache. + + \item \texttt{read\_realloc\_races} \\ + This counts events where the read path reads an extent and + discovers the bucket that was read from has been reused while + the IO was in flight, causing the read to be retried. + + \item \texttt{extent\_migrate\_done} \\ + This counts extents moved by the core move path, used by copygc + and rebalance. + + \item \texttt{extent\_migrate\_raced} \\ + This counts extents that the move path attempted to move but no + longer existed when doing the final btree update.
+\end{description} + +\subsubsection{Unit and performance tests} + +Echoing into \texttt{/sys/fs/bcachefs/<uuid>/perf\_test} runs various low level +btree tests, some intended as unit tests and others as performance tests. The +syntax is +\begin{quote} \begin{verbatim} + echo <test_name> <nr_iters> <nr_threads> > perf_test +\end{verbatim} \end{quote} + +When complete, the elapsed time will be printed in the dmesg log. The full list +of tests that can be run can be found near the bottom of +\texttt{fs/bcachefs/tests.c}. + +\subsection{Debugfs interface} + +The contents of every btree, as well as various internal per-btree-node +information, are available under \texttt{/sys/kernel/debug/bcachefs/<uuid>/}. + +For every btree, we have the following files: + +\begin{description} + \item \textit{btree\_name} \\ + Entire btree contents, one key per line + + \item \textit{btree\_name}\texttt{-formats} \\ + Information about each btree node: the size of the packed bkey + format, how full each btree node is, number of packed and + unpacked keys, and number of nodes and failed nodes in the + in-memory search trees. + + \item \textit{btree\_name}\texttt{-bfloat-failed} \\ + For each sorted set of keys in a btree node, we construct a + binary search tree in eytzinger layout with compressed keys. + Sometimes we aren't able to construct a correct compressed + search key, which results in slower lookups; this file lists the + keys that resulted in these failed nodes. +\end{description} + +\subsection{Listing and dumping filesystem metadata} + +\subsubsection{bcachefs show-super} + +This subcommand is used for examining and printing bcachefs superblocks. It +takes two optional parameters: +\begin{description} + \item \texttt{-l}: Print superblock layout, which records the amount of + space reserved for the superblock and the locations of the + backup superblocks. + \item \texttt{-f, --fields=(fields)}: List of superblock sections to + print, \texttt{all} to print all sections.
+\end{description} + +\subsubsection{bcachefs list} + +This subcommand gives access to the same functionality as the debugfs interface, +listing btree nodes and contents, but for offline filesystems. + +\subsubsection{bcachefs list\_journal} + +This subcommand lists the contents of the journal, which primarily records btree +updates ordered by when they occurred. + +\subsubsection{bcachefs dump} + +This subcommand can dump all metadata in a filesystem (including multi device +filesystems) as qcow2 images: when encountering issues that \texttt{fsck} can +not recover from and need attention from the developers, this makes it possible +to send the developers only the required metadata. Encrypted filesystems must +first be unlocked with \texttt{bcachefs remove-passphrase}. + +\section{ioctl interface} + +This section documents bcachefs-specific ioctls: + +\begin{description} + \item \texttt{BCH\_IOCTL\_QUERY\_UUID} \\ + Returns the UUID of the filesystem: used to find the sysfs + directory given a path to a mounted filesystem. + + \item \texttt{BCH\_IOCTL\_FS\_USAGE} \\ + Queries filesystem usage, returning global counters and a list + of counters by \texttt{bch\_replicas} entry. + + \item \texttt{BCH\_IOCTL\_DEV\_USAGE} \\ + Queries usage for a particular device, as bucket and sector + counts broken out by data type. + + \item \texttt{BCH\_IOCTL\_READ\_SUPER} \\ + Returns the filesystem superblock, and optionally the superblock + for a particular device given that device's index. + + \item \texttt{BCH\_IOCTL\_DISK\_ADD} \\ + Given a path to a device, adds it to a mounted and running + filesystem. The device must already have a bcachefs superblock; + options and parameters are read from the new device's superblock + and added to the member info section of the existing + filesystem's superblock. + + \item \texttt{BCH\_IOCTL\_DISK\_REMOVE} \\ + Given a path to a device or a device index, attempts to remove + it from a mounted and running filesystem.
This operation + requires walking the btree to remove all references to this + device, and may fail if data would become degraded or lost, + unless appropriate force flags are set. + + \item \texttt{BCH\_IOCTL\_DISK\_ONLINE} \\ + Given a path to a device that is a member of a running + filesystem (in degraded mode), brings it back online. + + \item \texttt{BCH\_IOCTL\_DISK\_OFFLINE} \\ + Given a path or device index of a device in a multi device + filesystem, attempts to close it without removing it, so that + the device may be re-added later and the contents will still be + available. + + \item \texttt{BCH\_IOCTL\_DISK\_SET\_STATE} \\ + Given a path or device index of a device in a multi device + filesystem, attempts to set its state to one of read-write, + read-only, failed or spare. Takes flags to force if the + filesystem would become degraded. + + \item \texttt{BCH\_IOCTL\_DISK\_GET\_IDX} \\ + \item \texttt{BCH\_IOCTL\_DISK\_RESIZE} \\ + \item \texttt{BCH\_IOCTL\_DISK\_RESIZE\_JOURNAL} \\ + \item \texttt{BCH\_IOCTL\_DATA} \\ + Starts a data job, which walks all data and/or metadata in a + filesystem, performing some operation on each btree + node and extent. Returns a file descriptor which can be read + from to get the current status of the job, and closing the file + descriptor (i.e. on process exit) stops the data job. + + \item \texttt{BCH\_IOCTL\_SUBVOLUME\_CREATE} \\ + \item \texttt{BCH\_IOCTL\_SUBVOLUME\_DESTROY} \\ + \item \texttt{BCHFS\_IOC\_REINHERIT\_ATTRS} \\ +\end{description} + +\section{On disk format} + +\subsection{Superblock} + +The superblock is the first thing to be read when accessing a bcachefs +filesystem. It is located 4kb from the start of the device, with redundant +copies elsewhere - typically one immediately after the first superblock, and one +at the end of the device. + +The \texttt{bch\_sb\_layout} records the amount of space reserved for the +superblock as well as the locations of all the superblocks.
It is included with +every superblock, and additionally written 3584 bytes from the start of the +device (512 bytes before the first superblock). + +Most of the superblock is identical across each device. The exceptions are the +\texttt{dev\_idx} field, and the journal section which gives the location of the +journal. + +The main section of the superblock contains UUIDs, version numbers, number of +devices within the filesystem and device index, block size, filesystem creation +time, and various options and settings. The superblock also has a number of +variable length sections: + +\begin{description} + \item \texttt{BCH\_SB\_FIELD\_journal} \\ + List of buckets used for the journal on this device. + + \item \texttt{BCH\_SB\_FIELD\_members} \\ + List of member devices, as well as per-device options and + settings, including bucket size, number of buckets and time when + last mounted. + + \item \texttt{BCH\_SB\_FIELD\_crypt} \\ + Contains the main chacha20 encryption key, encrypted by the + user's passphrase, as well as key derivation function settings. + + \item \texttt{BCH\_SB\_FIELD\_replicas} \\ + Contains a list of replica entries, which are lists of devices + that have extents replicated across them. + + \item \texttt{BCH\_SB\_FIELD\_quota} \\ + Contains timelimit and warnlimit fields for each quota type + (user, group and project) and counter (space, inodes). + + \item \texttt{BCH\_SB\_FIELD\_disk\_groups} \\ + Formerly referred to as disk groups (and still is throughout the + code); this section contains device label strings and records + the tree structure of label paths, allowing a label once parsed + to be referred to by integer ID by the target options. 
+ + \item \texttt{BCH\_SB\_FIELD\_clean} \\ + When the filesystem is clean, this section contains a list of + journal entries that are normally written with each journal + write (\texttt{struct jset}): btree roots, as well as filesystem + usage and read/write counters (total amount of data read/written + to this filesystem). This allows reading the journal to be + skipped after clean shutdowns. +\end{description} + +\subsection{Journal} + +Every journal write (\texttt{struct jset}) contains a list of entries: +\texttt{struct jset\_entry}. Below are listed the various journal entry types. + +\begin{description} + \item \texttt{BCH\_JSET\_ENTRY\_btree\_key} \\ + This entry type is used to record every btree update that + happens. It contains one or more btree keys (\texttt{struct + bkey}), and the \texttt{btree\_id} and \texttt{level} fields of + \texttt{jset\_entry} record the btree ID and level the key + belongs to. + + \item \texttt{BCH\_JSET\_ENTRY\_btree\_root} \\ + This entry type is used for pointers to btree roots. In the current + implementation, every journal write still records every btree + root, although that is subject to change. A btree root is a bkey + of type \texttt{KEY\_TYPE\_btree\_ptr\_v2}, and the btree\_id + and level fields of \texttt{jset\_entry} record the btree ID and + depth. + + \item \texttt{BCH\_JSET\_ENTRY\_clock} \\ + Records IO time, not wall clock time - i.e. the amount of reads + and writes, in 512 byte sectors since the filesystem was + created. + + \item \texttt{BCH\_JSET\_ENTRY\_usage} \\ + Used for certain persistent counters: number of inodes, current + maximum key version, and sectors of persistent reservations. + + \item \texttt{BCH\_JSET\_ENTRY\_data\_usage} \\ + Stores replica entries with a usage counter, in sectors. + + \item \texttt{BCH\_JSET\_ENTRY\_dev\_usage} \\ + Stores usage counters for each device: sectors used and buckets + used, broken out by each data type.
+\end{description} + +\subsection{Btrees} + +\subsection{Btree keys} + +\begin{description} + \item \texttt{KEY\_TYPE\_deleted} + \item \texttt{KEY\_TYPE\_whiteout} + \item \texttt{KEY\_TYPE\_error} + \item \texttt{KEY\_TYPE\_cookie} + \item \texttt{KEY\_TYPE\_hash\_whiteout} + \item \texttt{KEY\_TYPE\_btree\_ptr} + \item \texttt{KEY\_TYPE\_extent} + \item \texttt{KEY\_TYPE\_reservation} + \item \texttt{KEY\_TYPE\_inode} + \item \texttt{KEY\_TYPE\_inode\_generation} + \item \texttt{KEY\_TYPE\_dirent} + \item \texttt{KEY\_TYPE\_xattr} + \item \texttt{KEY\_TYPE\_alloc} + \item \texttt{KEY\_TYPE\_quota} + \item \texttt{KEY\_TYPE\_stripe} + \item \texttt{KEY\_TYPE\_reflink\_p} + \item \texttt{KEY\_TYPE\_reflink\_v} + \item \texttt{KEY\_TYPE\_inline\_data} + \item \texttt{KEY\_TYPE\_btree\_ptr\_v2} + \item \texttt{KEY\_TYPE\_indirect\_inline\_data} + \item \texttt{KEY\_TYPE\_alloc\_v2} + \item \texttt{KEY\_TYPE\_subvolume} + \item \texttt{KEY\_TYPE\_snapshot} + \item \texttt{KEY\_TYPE\_inode\_v2} + \item \texttt{KEY\_TYPE\_alloc\_v3} +\end{description} + +\end{document} diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 537ab79..b1385a7 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -static void extent_sort_append(struct bch_fs *c, - struct bkey_format *f, - struct btree_nr_keys *nr, - struct bkey_packed **out, - struct bkey_s k) -{ - if (!bkey_deleted(k.k)) { - if (!bch2_bkey_pack_key(*out, k.k, f)) - memcpy_u64s_small(*out, k.k, BKEY_U64s); - - memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); - - btree_keys_account_key_add(nr, 0, *out); - *out = bkey_next(*out); - } -} - /* Sort + repack in a new format: */ struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, @@ -144,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, struct bkey_format *in_f = &src->format; struct bkey_packed *in, 
*out = vstruct_last(dst); struct btree_nr_keys nr; + bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); memset(&nr, 0, sizeof(nr)); @@ -151,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src, if (filter_whiteouts && bkey_deleted(in)) continue; - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? in_f : &bch2_bkey_format_current, in)) + if (!transform) + bkey_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? in_f : &bch2_bkey_format_current, in)) out->format = KEY_FORMAT_LOCAL_BTREE; else bch2_bkey_unpack(src, (void *) out, in); @@ -165,47 +151,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *c, - struct bset *dst, struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_buf k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - bch2_bkey_buf_init(&k); - - while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_deleted(k_packed)) - continue; - - /* - * NOTE: - * bch2_bkey_normalize may modify the key we pass it (dropping - * stale pointers) and we don't have a write lock on the src - * node; we have to make a copy of the entire key before calling - * normalize - */ - bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); - bch2_bkey_unpack(src, k.k, k_packed); - - if (filter_whiteouts && - bch2_bkey_normalize(c, bkey_i_to_s(k.k))) - continue; - - extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bch2_bkey_buf_exit(&k, c); - return nr; -} - static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h index 
1059996..79cf11d 100644 --- a/libbcachefs/bkey_sort.h +++ b/libbcachefs/bkey_sort.h @@ -37,11 +37,6 @@ struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *, - struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index c19c3ac..9b22c5e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -391,16 +391,10 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_is_extents(src)) - nr = bch2_sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - else - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index fc9d5ba..bdbb900 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -155,11 +155,19 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) * goes to 0, and it's safe because we have the node intent * locked: */ - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); + if (!b->c.lock.readers) + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_sub(*b->c.lock.readers, readers); + btree_node_lock_type(trans->c, b, SIX_LOCK_write); - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); + + if (!b->c.lock.readers) + atomic64_add(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_add(*b->c.lock.readers, readers); } bool __bch2_btree_node_relock(struct 
btree_trans *trans, @@ -369,19 +377,16 @@ bool __bch2_btree_node_lock(struct btree_trans *trans, if (six_trylock_type(&b->c.lock, type)) return true; -#ifdef CONFIG_BCACHEFS_DEBUG trans->locking_path_idx = path->idx; trans->locking_pos = pos; trans->locking_btree_id = path->btree_id; trans->locking_level = level; trans->locking = b; -#endif ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; -#ifdef CONFIG_BCACHEFS_DEBUG trans->locking = NULL; -#endif + if (ret) bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], start_time); @@ -2796,12 +2801,10 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -#ifdef CONFIG_BCACHEFS_DEBUG trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); list_add(&trans->list, &c->btree_trans_list); mutex_unlock(&c->btree_trans_lock); -#endif } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -2840,11 +2843,9 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); -#ifdef CONFIG_BCACHEFS_DEBUG mutex_lock(&c->btree_trans_lock); list_del(&trans->list); mutex_unlock(&c->btree_trans_lock); -#endif srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); @@ -2888,7 +2889,6 @@ bch2_btree_path_node_to_text(struct printbuf *out, bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } -#ifdef CONFIG_BCACHEFS_DEBUG static bool trans_has_locks(struct btree_trans *trans) { struct btree_path *path; @@ -2898,11 +2898,9 @@ static bool trans_has_locks(struct btree_trans *trans) return true; return false; } -#endif void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { -#ifdef CONFIG_BCACHEFS_DEBUG struct btree_trans *trans; struct btree_path *path; struct btree *b; @@ -2956,7 +2954,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) } } mutex_unlock(&c->btree_trans_lock); -#endif } void bch2_fs_btree_iter_exit(struct bch_fs *c) diff --git 
a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 2c2e2f7..22dbbe3 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -366,7 +366,6 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; -#ifdef CONFIG_BCACHEFS_DEBUG struct list_head list; struct btree *locking; unsigned locking_path_idx; @@ -374,7 +373,6 @@ struct btree_trans { u8 locking_btree_id; u8 locking_level; pid_t pid; -#endif unsigned long ip; int srcu_idx; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 295942e..95d1988 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -1271,22 +1271,24 @@ err: * When deleting, check if we need to emit a whiteout (because we're overwriting * something in an ancestor snapshot) */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) +static int need_whiteout_for_snapshot(struct btree_trans *trans, struct btree_iter *orig) { struct btree_iter iter; struct bkey_s_c k; - u32 snapshot = pos.snapshot; + u32 snapshot = orig->pos.snapshot; int ret; - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + if (!bch2_snapshot_parent(trans->c, snapshot)) return 0; - pos.snapshot++; + bch2_trans_copy_iter(&iter, orig); + iter.flags &= BTREE_ITER_FILTER_SNAPSHOTS; + iter.flags |= BTREE_ITER_ALL_SNAPSHOTS; - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (bkey_cmp(k.k->p, pos)) + bch2_btree_iter_advance(&iter); + + for_each_btree_key_continue_norestart(iter, 0, k, ret) { + if (bkey_cmp(k.k->p, orig->pos)) break; if (bch2_snapshot_is_ancestor(trans->c, snapshot, @@ -1312,6 +1314,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); BUG_ON(bpos_cmp(k->k.p, iter->path->pos)); + BUG_ON(bpos_cmp(k->k.p, iter->pos)); n = (struct btree_insert_entry) { .flags = flags, @@ -1332,7 +1335,7 @@ 
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (bkey_deleted(&n.k->k) && (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - int ret = need_whiteout_for_snapshot(trans, n.btree_id, n.k->k.p); + int ret = need_whiteout_for_snapshot(trans, iter); if (unlikely(ret < 0)) return ret; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 4ad843f..9cdd03f 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -2705,7 +2705,8 @@ int bch2_truncate(struct user_namespace *mnt_userns, U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); - BUG_ON(!inode->v.i_size && inode->v.i_blocks); + WARN_ON(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal)); if (unlikely(ret)) goto err; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index d5d32bf..3f51eda 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -262,21 +262,6 @@ static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) return ret; } -static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); - - if (!fs_usage) - return -ENOMEM; - - bch2_fs_usage_to_text(out, c, fs_usage); - - percpu_up_read(&c->mark_lock); - - kfree(fs_usage); - return 0; -} - static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; @@ -386,9 +371,6 @@ SHOW(bch2_fs) /* Debugging: */ - if (attr == &sysfs_alloc_debug) - return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) { bch2_journal_debug_to_text(&out, &c->journal); return out.pos - buf; @@ -580,7 +562,6 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { - &sysfs_alloc_debug, &sysfs_journal_debug, &sysfs_journal_pins, &sysfs_btree_updates, @@ -588,17 +569,21 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_cache, &sysfs_btree_key_cache, 
&sysfs_btree_transactions, + &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + + &sysfs_trigger_journal_flush, + &sysfs_trigger_gc, + &sysfs_prune_cache, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, - &sysfs_trigger_journal_flush, - &sysfs_trigger_gc, &sysfs_gc_gens_pos, - &sysfs_prune_cache, &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, @@ -607,11 +592,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_new_stripes, - - &sysfs_io_timers_read, - &sysfs_io_timers_write, - &sysfs_data_op_data_progress, &sysfs_internal_uuid, diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index dfd8c43..478c00a 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -14,12 +14,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - POS(0, 0), POS(0, U64_MAX), + SPOS(0, 0, U32_MAX), + SPOS(0, U64_MAX, U32_MAX), NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), + SPOS(0, 0, U32_MAX), + SPOS(0, U64_MAX, U32_MAX), NULL); BUG_ON(ret); } @@ -541,10 +543,11 @@ static int rand_lookup(struct bch_fs *c, u64 nr) u64 i; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(&iter, POS(0, test_rand())); + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); @@ -567,7 +570,7 @@ static int rand_mixed_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); k = bch2_btree_iter_peek(iter); ret = bkey_err(k); @@ -594,7 +597,8 @@ static int rand_mixed(struct bch_fs *c, 
u64 nr) u64 i, rand; bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { rand = test_rand(); @@ -673,7 +677,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter.pos; @@ -703,7 +707,8 @@ static int seq_lookup(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) ; bch2_trans_iter_exit(&trans, &iter); @@ -720,7 +725,8 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; @@ -745,8 +751,7 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), - NULL); + SPOS(0, 0, U32_MAX), POS_MAX, NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); return ret;